; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
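;
; Illustrative sketch (comment only, not part of the autogenerated checks; the
; value names %v, %even, %wide and %trunc are made up for this example): on a
; little-endian target such as x86_64, selecting the even elements with a
; strided shufflevector
;   %even = shufflevector <16 x i8> %v, <16 x i8> undef,
;           <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; picks the same bytes as bitcasting to a wider element type and truncating
;   %wide  = bitcast <16 x i8> %v to <8 x i16>
;   %trunc = trunc <8 x i16> %wide to <8 x i8>
; because trunc keeps the low byte of each i16 lane.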

define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v8i16_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v8i16_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v8i16_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i16_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <8 x i16>
  %strided.vec = trunc <8 x i16> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v4i32_to_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %bc = bitcast <4 x i32> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i32>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_v2i64_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i16>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v2i64_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v2i64_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}
