; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VBMIVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
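; For example, in the first pair below, extracting the even byte lanes of a
; <32 x i8> with a stride-2 shufflevector is equivalent (on this little-endian
; target) to bitcasting the vector to <16 x i16> and truncating each element
; to i8.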

define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
; AVX-NEXT:    vpand (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
; AVX512F-NEXT:    vpand (%rdi), %xmm0, %xmm0
; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
; AVX512VL-NEXT:    vpand (%rdi), %xmm0, %xmm0
; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v16i16_to_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v16i16_to_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v16i16_to_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <16 x i16>
  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX512F-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX512BW-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT:    vmovaps %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT:    vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovaps %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %bc = bitcast <8 x i32> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: trunc_v8i32_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
  %bc = bitcast <8 x i8> %truncated.vec to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
  %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <8 x i32> %vec to <8 x i16>
  %bc = bitcast <8 x i16> %truncated to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}

define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %bc = bitcast <4 x i16> %truncated to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
  %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <4 x i64> %vec to <4 x i32>
  %bc = bitcast <4 x i32> %truncated to <8 x i16>
  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
  ret <8 x i16> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %result
}

define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, %xmm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %truncated = trunc <4 x i64> %vec to <4 x i8>
  %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
  ret <16 x i8> %result
}

define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX-LABEL: trunc_v4i64_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: trunc_v4i64_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; In this case not all elements are collected from the same source vector, so
; the resulting BUILD_VECTOR should not be combined to a truncate.
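; (Element 0 of the result is taken from %w via insertelement, while the
; remaining elements are the even lanes of %v.)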
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: negative:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: negative:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: negative:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT:    vpternlogq $206, %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3]
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: negative:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: negative:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT:    movl $65537, %eax # imm = 0x10001
; AVX512BWVL-NEXT:    kmovd %eax, %k1
; AVX512BWVL-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: negative:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMIVL-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %w0 = extractelement <32 x i8> %w, i32 0
  %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
  ret <16 x i8> %merged
}