; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

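; Truncate <8 x i64> to <8 x i32>. Pre-AVX512 targets gather the even dwords
; with shuffles; AVX512 folds the whole truncation into a single vpmovqd.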
define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i32>
  ret <8 x i32> %0
}

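; An ashr by 32 followed by truncation selects the high dword of each i64, so
; the SSE/AVX1 shuffles pick lanes [1,3]; AVX512 shifts (vpsraq) then vpmovqd.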
define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

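; The lshr variant: SSE/AVX1 still fold the shift into the [1,3] shuffle,
; while AVX2 and AVX512 emit an explicit vpsrlq before truncating.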
define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

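; Truncate <8 x i64> to <8 x i16>. SSE2/SSSE3 need a pshufd/pshuflw chain,
; SSE4.1/AVX1 zero the odd words with pblendw and chain packusdw, and AVX512
; uses a single vpmovqw.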
define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i16:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i16:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}

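; Truncate <8 x i64> to <8 x i8> and store. Pre-AVX512 targets mask each
; element to its low byte and pack down in stages; AVX512 stores with vpmovqb.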
define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    packuswb %xmm0, %xmm0
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i8:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rax)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i8:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rax)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

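; Truncate <8 x i32> to <8 x i16>. SSE2 sign-extends in place (pslld/psrad)
; so packssdw cannot saturate; SSSE3+ use pshufb, AVX512VL a single vpmovdw.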
define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %0
}

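; After an ashr by 16 the values already fit in i16, so psrad followed by the
; signed pack packssdw is exact on every subtarget.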
define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
; SSE-LABEL: trunc8i32_8i16_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

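; The lshr variant: SSE4.1+ pack with packusdw since the shifted values are
; zero-extended 16-bit values; SSE2 re-sign-extends before packssdw.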
define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

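; Truncate <8 x i32> to <8 x i8> and store. SSSE3+ grab byte 0 of each dword
; with pshufb; AVX512VL writes the result directly with vpmovdb.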
define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

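; Truncate <16 x i32> to <16 x i16> and store. Each 256-bit half is packed
; separately pre-AVX512; AVX512 stores the result with one vpmovdw.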
define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pslld $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    pslld $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    pslld $16, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %0, <16 x i16>* undef, align 4
  ret void
}

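; ashr by 16 then truncate: psrad + packssdw per half. AVX512 uses the
; logical vpsrld, which is equivalent because truncation keeps only the low
; 16 bits of each shifted element.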
define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
; SSE-LABEL: trunc16i32_16i16_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrad $16, %xmm3
; SSE-NEXT:    psrad $16, %xmm2
; SSE-NEXT:    packssdw %xmm3, %xmm2
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm2, (%rax)
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrad $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

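; The lshr version packs with packusdw. The AVX2 ymm pack interleaves the
; 128-bit lanes, hence the vpermq [0,2,1,3] fix-up afterwards.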
define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $16, %xmm2
; SSE2-NEXT:    psrld $16, %xmm3
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $16, %xmm2
; SSSE3-NEXT:    psrld $16, %xmm3
; SSSE3-NEXT:    psrld $16, %xmm0
; SSSE3-NEXT:    psrld $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    pslld $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    pslld $16, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

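; Truncate <16 x i32> to <16 x i8> and store: mask to bytes, then pack
; dwords to words to bytes; AVX512 again needs only a vpmovdb to memory.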
define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

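; ashr by 24 then truncate to i8: the shifted values fit in a signed byte, so
; packssdw + packsswb are exact; AVX512 substitutes the equivalent vpsrld.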
define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; SSE-LABEL: trunc16i32_16i8_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    psrad $24, %xmm3
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    packssdw %xmm3, %xmm2
; SSE-NEXT:    packsswb %xmm2, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrad $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

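; The lshr-by-24 version: the shifted dwords are already in byte range, so
; SSE2 can packuswb twice without any masking.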
define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $24, %xmm1
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    psrld $24, %xmm3
; SSE2-NEXT:    psrld $24, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $24, %xmm1
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    psrld $24, %xmm3
; SSSE3-NEXT:    psrld $24, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $24, %xmm1
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm3
; SSE41-NEXT:    psrld $24, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

;PR25684
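; Truncate <16 x i16> to <16 x i8> and store. vpmovwb requires AVX512BW, so
; plain AVX512F/VL widen the words with vpmovsxwd and truncate with vpmovdb.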
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE2-LABEL: trunc16i16_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i16_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i16_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

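; ashr by 8 then truncate: psraw + packsswb is exact. AVX512BWVL uses vpsrlw
; since only the low byte of each shifted word survives the truncation.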
define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    packsswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

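; The lshr version: psrlw + packuswb up through AVX2; the AVX512 variants
; shift and then go through the truncating moves as above.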
define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

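; Truncate <32 x i16> to <32 x i8> and store. Only AVX512BW has vpmovwb for
; a 512-bit source; AVX512F/VL split into two vpmovsxwd/vpmovdb round trips.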
define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE2-LABEL: trunc32i16_32i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc32i16_32i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pshufb %xmm4, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc32i16_32i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm4, %xmm1
; SSE41-NEXT:    pshufb %xmm4, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    pshufb %xmm4, %xmm3
; SSE41-NEXT:    pshufb %xmm4, %xmm2
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc32i16_32i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc32i16_32i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, <32 x i8>* undef, align 4
  ret void
}

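; Two <4 x i64> truncations concatenated into <8 x i32>. AVX512VL truncates
; each ymm with vpmovqd and reassembles the halves with vinserti128.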
define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: trunc2x4i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i32:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i32:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i32>
  %1 = trunc <4 x i64> %b to <4 x i32>
  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %2
}

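; Truncate a pair of <4 x i64> arguments to <4 x i16> and concatenate the
; results into a single <8 x i16>.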
define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE41-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i16:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i16>
  %1 = trunc <4 x i64> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

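; Truncate a pair of <2 x i64> arguments to <2 x i32> and concatenate the
; results into a single <4 x i32>.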
define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: trunc2x2i64_4i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2x2i64_4i32:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2x2i64_4i32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %a to <2 x i32>
  %1 = trunc <2 x i64> %b to <2 x i32>
  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

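; Truncate <2 x i64> to <2 x i32> and bitcast the result to a scalar i64.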
define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2i64_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc2i64_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2i64_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2i64_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2i64_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %inval to <2 x i32>
  %1 = bitcast <2 x i32> %0 to i64
  ret i64 %1
}

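; Truncate a pair of <4 x i32> arguments to <4 x i16> and concatenate the
; results into a single <8 x i16>.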
define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc2x4i32_8i16:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2x4i32_8i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %a to <4 x i16>
  %1 = trunc <4 x i32> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
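; Truncate <4 x i32> to <4 x i16> and bitcast the result to a scalar i64.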
define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-LABEL: trunc4i32_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc4i32_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc4i32_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc4i32_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc4i32_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc4i32_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc4i32_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc4i32_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %inval to <4 x i16>
  %1 = bitcast <4 x i16> %0 to i64
  ret i64 %1
}

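; Truncate a pair of <8 x i16> arguments to <8 x i8> and concatenate the
; results into a single <16 x i8>.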
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: trunc2x8i16_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x8i16_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x8i16_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc2x8i16_16i8:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2x8i16_16i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %a to <8 x i8>
  %1 = trunc <8 x i16> %b to <8 x i8>
  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
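; Truncate <8 x i16> to <8 x i8> and bitcast the result to a scalar i64.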
define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2-LABEL: trunc8i16_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i16_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i16_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc8i16_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc8i16_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i16_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i16_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i16_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %inval to <8 x i8>
  %1 = bitcast <8 x i8> %0 to i64
  ret i64 %1
}

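; Truncating a zeroinitializer constant should fold away completely, leaving
; just a zeroed register.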
define <16 x i8> @trunc16i64_16i8_const() {
; SSE-LABEL: trunc16i64_16i8_const:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc16i64_16i8_const:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc16i64_16i8_const:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
  ret <16 x i8> %1
}

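; PR32160: truncate <8 x i32> to <8 x i16>, then splat element 2 of the
; truncated result across all lanes.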
define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-LABEL: PR32160:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR32160:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR32160:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR32160:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: PR32160:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR32160:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR32160:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %shuf = trunc <8 x i32> %x to <8 x i16>
  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %trunc
}

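; PR34773: load two <16 x i16> vectors, shift every lane right by 8 to keep
; the high bytes, truncate to <16 x i8> and store.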
define void @PR34773(i16* %a0, i8* %a1) {
; SSE-LABEL: PR34773:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqu 32(%rdi), %xmm2
; SSE-NEXT:    movdqu 48(%rdi), %xmm3
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    movdqu %xmm0, (%rsi)
; SSE-NEXT:    movdqu %xmm2, 16(%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR34773:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
; AVX1-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR34773:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: PR34773:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34773:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34773:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34773:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpsrlw $8, 32(%rdi), %ymm1
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vpmovwb %ymm1, 16(%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1  = getelementptr i16, i16* %a0, i64 16
  %2  = getelementptr i8, i8* %a1, i64 16
  %3  = bitcast i16* %a0 to <16 x i16>*
  %4  = bitcast i16* %1 to <16 x i16>*
  %5  = bitcast i8* %a1 to <16 x i8>*
  %6  = bitcast i8* %2 to <16 x i8>*
  %7  = load <16 x i16>, <16 x i16>* %3, align 2
  %8  = load <16 x i16>, <16 x i16>* %4, align 2
  %9  = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %11 = trunc <16 x i16> %9  to <16 x i8>
  %12 = trunc <16 x i16> %10 to <16 x i8>
  store <16 x i8> %11, <16 x i8>* %5, align 1
  store <16 x i8> %12, <16 x i8>* %6, align 1
  ret void
}