; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; PR31551
; Pairs of shufflevector/trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
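;
; A little-endian sanity sketch of that equivalence (editor's note, not an
; autogenerated check): bitcasting <32 x i8> to <16 x i16> makes i16 lane k
; cover bytes 2k and 2k+1, and trunc to <16 x i8> keeps the low byte 2k of
; each lane. Collecting bytes 0,2,4,... with shufflevector therefore produces
; the same <16 x i8> value, which is why each shuffle/trunc pair below should
; ideally lower to identical machine code.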

define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v16i16_to_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v16i16_to_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v16i16_to_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxwd (%rdi), %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxwd (%rdi), %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <16 x i16>
  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %ymm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovaps %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovaps %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %bc = bitcast <8 x i32> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VBMIVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VBMIVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
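; (Sketch of the mapping, not part of the checked output: the builtin above
; becomes the IR at the end of this function -- trunc <8 x i32> to <8 x i8>,
; bitcast those 8 bytes to an i64, then insertelement that i64 into lane 0 of
; a zeroed <2 x i64>.)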
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
  %bc = bitcast <8 x i8> %truncated.vec to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
  %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <8 x i32> %vec to <8 x i16>
  %bc = bitcast <8 x i16> %truncated to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}

define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
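; (Likewise a sketch, not checked output: the builtin becomes trunc <4 x i64>
; to <4 x i16>, a bitcast of the 8 result bytes to i64, and an insertelement
; into lane 0 of a zeroed <2 x i64>, as in the IR at the end of this function.)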
723; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
724; AVX1:       # %bb.0:
725; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
726; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
727; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
728; AVX1-NEXT:    vzeroupper
729; AVX1-NEXT:    retq
730;
731; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
732; AVX2-SLOW:       # %bb.0:
733; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
734; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
735; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
736; AVX2-SLOW-NEXT:    vzeroupper
737; AVX2-SLOW-NEXT:    retq
738;
739; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
740; AVX2-FAST:       # %bb.0:
741; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
742; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
743; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
744; AVX2-FAST-NEXT:    vzeroupper
745; AVX2-FAST-NEXT:    retq
746;
747; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
748; AVX512F:       # %bb.0:
749; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
750; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
751; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
752; AVX512F-NEXT:    vzeroupper
753; AVX512F-NEXT:    retq
754;
755; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
756; AVX512VL:       # %bb.0:
757; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
758; AVX512VL-NEXT:    vzeroupper
759; AVX512VL-NEXT:    retq
760;
761; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
762; AVX512BW:       # %bb.0:
763; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
764; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
765; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
766; AVX512BW-NEXT:    vzeroupper
767; AVX512BW-NEXT:    retq
768;
769; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
770; AVX512BWVL:       # %bb.0:
771; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
772; AVX512BWVL-NEXT:    vzeroupper
773; AVX512BWVL-NEXT:    retq
774;
775; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
776; AVX512VBMIVL:       # %bb.0:
777; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
778; AVX512VBMIVL-NEXT:    vzeroupper
779; AVX512VBMIVL-NEXT:    retq
780  %truncated = trunc <4 x i64> %vec to <4 x i16>
781  %bc = bitcast <4 x i16> %truncated to i64
782  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
783  ret <2 x i64> %result
784}
785
786define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
787; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
788; AVX1:       # %bb.0:
789; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
790; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
791; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
792; AVX1-NEXT:    vzeroupper
793; AVX1-NEXT:    retq
794;
795; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
796; AVX2-SLOW:       # %bb.0:
797; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
798; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
799; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
800; AVX2-SLOW-NEXT:    vzeroupper
801; AVX2-SLOW-NEXT:    retq
802;
803; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
804; AVX2-FAST:       # %bb.0:
805; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
806; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
807; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
808; AVX2-FAST-NEXT:    vzeroupper
809; AVX2-FAST-NEXT:    retq
810;
811; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
812; AVX512F:       # %bb.0:
813; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
814; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
815; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
816; AVX512F-NEXT:    vzeroupper
817; AVX512F-NEXT:    retq
818;
819; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
820; AVX512VL:       # %bb.0:
821; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
822; AVX512VL-NEXT:    vzeroupper
823; AVX512VL-NEXT:    retq
824;
825; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
826; AVX512BW:       # %bb.0:
827; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
828; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
829; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
830; AVX512BW-NEXT:    vzeroupper
831; AVX512BW-NEXT:    retq
832;
833; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
834; AVX512BWVL:       # %bb.0:
835; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
836; AVX512BWVL-NEXT:    vzeroupper
837; AVX512BWVL-NEXT:    retq
838;
839; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
840; AVX512VBMIVL:       # %bb.0:
841; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
842; AVX512VBMIVL-NEXT:    vzeroupper
843; AVX512VBMIVL-NEXT:    retq
844  %truncated = trunc <4 x i64> %vec to <4 x i16>
845  %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
846  %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
847  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
848  ret <8 x i16> %result
849}
850
851define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
852; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
853; AVX1:       # %bb.0:
854; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
855; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
856; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
857; AVX1-NEXT:    vzeroupper
858; AVX1-NEXT:    retq
859;
860; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
861; AVX2-SLOW:       # %bb.0:
862; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
863; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
864; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
865; AVX2-SLOW-NEXT:    vzeroupper
866; AVX2-SLOW-NEXT:    retq
867;
868; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
869; AVX2-FAST:       # %bb.0:
870; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
871; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
872; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
873; AVX2-FAST-NEXT:    vzeroupper
874; AVX2-FAST-NEXT:    retq
875;
876; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
877; AVX512F:       # %bb.0:
878; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
879; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
880; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
881; AVX512F-NEXT:    vzeroupper
882; AVX512F-NEXT:    retq
883;
884; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
885; AVX512VL:       # %bb.0:
886; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
887; AVX512VL-NEXT:    vzeroupper
888; AVX512VL-NEXT:    retq
889;
890; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
891; AVX512BW:       # %bb.0:
892; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
893; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
894; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
895; AVX512BW-NEXT:    vzeroupper
896; AVX512BW-NEXT:    retq
897;
898; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
899; AVX512BWVL:       # %bb.0:
900; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
901; AVX512BWVL-NEXT:    vzeroupper
902; AVX512BWVL-NEXT:    retq
903;
904; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
905; AVX512VBMIVL:       # %bb.0:
906; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
907; AVX512VBMIVL-NEXT:    vzeroupper
908; AVX512VBMIVL-NEXT:    retq
909  %truncated = trunc <4 x i64> %vec to <4 x i32>
910  %bc = bitcast <4 x i32> %truncated to <8 x i16>
911  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
912  ret <8 x i16> %result
913}
914
915define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
916; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
917; AVX1:       # %bb.0:
918; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
919; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
920; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
921; AVX1-NEXT:    vzeroupper
922; AVX1-NEXT:    retq
923;
924; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
925; AVX2-SLOW:       # %bb.0:
926; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
927; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
928; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
929; AVX2-SLOW-NEXT:    vzeroupper
930; AVX2-SLOW-NEXT:    retq
931;
932; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
933; AVX2-FAST:       # %bb.0:
934; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
935; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
936; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
937; AVX2-FAST-NEXT:    vzeroupper
938; AVX2-FAST-NEXT:    retq
939;
940; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
941; AVX512F:       # %bb.0:
942; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
943; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
944; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
945; AVX512F-NEXT:    vzeroupper
946; AVX512F-NEXT:    retq
947;
948; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
949; AVX512VL:       # %bb.0:
950; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
951; AVX512VL-NEXT:    vzeroupper
952; AVX512VL-NEXT:    retq
953;
954; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
955; AVX512BW:       # %bb.0:
956; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
957; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
958; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
959; AVX512BW-NEXT:    vzeroupper
960; AVX512BW-NEXT:    retq
961;
962; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
963; AVX512BWVL:       # %bb.0:
964; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
965; AVX512BWVL-NEXT:    vzeroupper
966; AVX512BWVL-NEXT:    retq
967;
968; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
969; AVX512VBMIVL:       # %bb.0:
970; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
971; AVX512VBMIVL-NEXT:    vzeroupper
972; AVX512VBMIVL-NEXT:    retq
973  %truncated = trunc <4 x i64> %vec to <4 x i16>
974  %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
975  ret <8 x i16> %result
976}
977
978define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
979; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
980; AVX1:       # %bb.0:
981; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
982; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
983; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
984; AVX1-NEXT:    vzeroupper
985; AVX1-NEXT:    retq
986;
987; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
988; AVX2-SLOW:       # %bb.0:
989; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
990; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
991; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
992; AVX2-SLOW-NEXT:    vzeroupper
993; AVX2-SLOW-NEXT:    retq
994;
995; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
996; AVX2-FAST:       # %bb.0:
997; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
998; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
999; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
1000; AVX2-FAST-NEXT:    vzeroupper
1001; AVX2-FAST-NEXT:    retq
1002;
1003; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1004; AVX512F:       # %bb.0:
1005; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1006; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1007; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
1008; AVX512F-NEXT:    vzeroupper
1009; AVX512F-NEXT:    retq
1010;
1011; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1012; AVX512VL:       # %bb.0:
1013; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
1014; AVX512VL-NEXT:    vzeroupper
1015; AVX512VL-NEXT:    retq
1016;
1017; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1018; AVX512BW:       # %bb.0:
1019; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1020; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1021; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
1022; AVX512BW-NEXT:    vzeroupper
1023; AVX512BW-NEXT:    retq
1024;
1025; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1026; AVX512BWVL:       # %bb.0:
1027; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
1028; AVX512BWVL-NEXT:    vzeroupper
1029; AVX512BWVL-NEXT:    retq
1030;
1031; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1032; AVX512VBMIVL:       # %bb.0:
1033; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, %xmm0
1034; AVX512VBMIVL-NEXT:    vzeroupper
1035; AVX512VBMIVL-NEXT:    retq
1036  %truncated = trunc <4 x i64> %vec to <4 x i8>
1037  %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
1038  ret <16 x i8> %result
1039}
1040
1041define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
1042; AVX1-LABEL: shuffle_v16i16_to_v4i16:
1043; AVX1:       # %bb.0:
1044; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
1045; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1046; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1047; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1048; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1049; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1050; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1051; AVX1-NEXT:    vmovq %xmm0, (%rsi)
1052; AVX1-NEXT:    vzeroupper
1053; AVX1-NEXT:    retq
1054;
1055; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
1056; AVX2-SLOW:       # %bb.0:
1057; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
1058; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1059; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1060; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1061; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1062; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1063; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1064; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
1065; AVX2-SLOW-NEXT:    vzeroupper
1066; AVX2-SLOW-NEXT:    retq
1067;
1068; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
1069; AVX2-FAST:       # %bb.0:
1070; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
1071; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
1072; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
1073; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1074; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1075; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1076; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
1077; AVX2-FAST-NEXT:    vzeroupper
1078; AVX2-FAST-NEXT:    retq
1079;
1080; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
1081; AVX512F:       # %bb.0:
1082; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1083; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
1084; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1085; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1086; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1087; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1088; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1089; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
1090; AVX512F-NEXT:    vzeroupper
1091; AVX512F-NEXT:    retq
1092;
1093; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
1094; AVX512VL:       # %bb.0:
1095; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
1096; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
1097; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1098; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
1099; AVX512VL-NEXT:    vzeroupper
1100; AVX512VL-NEXT:    retq
1101;
1102; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
1103; AVX512BW:       # %bb.0:
1104; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
1105; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1106; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
1107; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1108; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1109; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1110; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
1111; AVX512BW-NEXT:    vzeroupper
1112; AVX512BW-NEXT:    retq
1113;
1114; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
1115; AVX512BWVL:       # %bb.0:
1116; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
1117; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
1118; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1119; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
1120; AVX512BWVL-NEXT:    vzeroupper
1121; AVX512BWVL-NEXT:    retq
1122;
1123; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
1124; AVX512VBMIVL:       # %bb.0:
1125; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %ymm0
1126; AVX512VBMIVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
1127; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1128; AVX512VBMIVL-NEXT:    vpmovdw %xmm0, (%rsi)
1129; AVX512VBMIVL-NEXT:    vzeroupper
1130; AVX512VBMIVL-NEXT:    retq
1131  %vec = load <16 x i16>, <16 x i16>* %L
1132  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
1133  store <4 x i16> %strided.vec, <4 x i16>* %S
1134  ret void
1135}
1136
1137define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
1138; AVX1-LABEL: trunc_v4i64_to_v4i16:
1139; AVX1:       # %bb.0:
1140; AVX1-NEXT:    vmovaps (%rdi), %ymm0
1141; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1142; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1143; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1144; AVX1-NEXT:    vmovq %xmm0, (%rsi)
1145; AVX1-NEXT:    vzeroupper
1146; AVX1-NEXT:    retq
1147;
1148; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
1149; AVX2-SLOW:       # %bb.0:
1150; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
1151; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1152; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1153; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
1154; AVX2-SLOW-NEXT:    vzeroupper
1155; AVX2-SLOW-NEXT:    retq
1156;
1157; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
1158; AVX2-FAST:       # %bb.0:
1159; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
1160; AVX2-FAST-NEXT:    vpermd (%rdi), %ymm0, %ymm0
1161; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1162; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
1163; AVX2-FAST-NEXT:    vzeroupper
1164; AVX2-FAST-NEXT:    retq
1165;
1166; AVX512F-LABEL: trunc_v4i64_to_v4i16:
1167; AVX512F:       # %bb.0:
1168; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1169; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1170; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1171; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
1172; AVX512F-NEXT:    vzeroupper
1173; AVX512F-NEXT:    retq
1174;
1175; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
1176; AVX512VL:       # %bb.0:
1177; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
1178; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
1179; AVX512VL-NEXT:    vzeroupper
1180; AVX512VL-NEXT:    retq
1181;
1182; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
1183; AVX512BW:       # %bb.0:
1184; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
1185; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1186; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1187; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
1188; AVX512BW-NEXT:    vzeroupper
1189; AVX512BW-NEXT:    retq
1190;
1191; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
1192; AVX512BWVL:       # %bb.0:
1193; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
1194; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
1195; AVX512BWVL-NEXT:    vzeroupper
1196; AVX512BWVL-NEXT:    retq
1197;
1198; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
1199; AVX512VBMIVL:       # %bb.0:
1200; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
1201; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, (%rsi)
1202; AVX512VBMIVL-NEXT:    vzeroupper
1203; AVX512VBMIVL-NEXT:    retq
1204  %vec = load <16 x i16>, <16 x i16>* %L
1205  %bc = bitcast <16 x i16> %vec to <4 x i64>
1206  %strided.vec = trunc <4 x i64> %bc to <4 x i16>
1207  store <4 x i16> %strided.vec, <4 x i16>* %S
1208  ret void
1209}
1210
define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VBMIVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

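; Truncating <4 x i64> to <4 x i8> keeps the low byte of each 8-byte lane,
; which on this little-endian target are bytes 0, 8, 16 and 24 of the loaded
; value, i.e. exactly the bytes @shuffle_v32i8_to_v4i8 above extracts.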
define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; In this case not all elements are collected from the same source vector, so
; the resulting BUILD_VECTOR should not be combined to a truncate.
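; Element 0 of the result is taken from %w (via the insertelement below),
; while elements 1-15 are the even lanes of %v, so no truncate of a single
; source value can produce the merged result.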
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: negative:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: negative:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: negative:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: negative:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: negative:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT:    movl $65537, %eax # imm = 0x10001
; AVX512BWVL-NEXT:    kmovd %eax, %k1
; AVX512BWVL-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: negative:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %w0 = extractelement <32 x i8> %w, i32 0
  %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
  ret <16 x i8> %merged
}