1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
6
7define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
8; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
9; AVX512F:       # %bb.0:
10; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
11; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
12; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
13; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
14; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
15; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
16; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
17; AVX512F-NEXT:    vzeroupper
18; AVX512F-NEXT:    retq
19;
20; AVX512VL-LABEL: shuffle_v64i8_to_v32i8_1:
21; AVX512VL:       # %bb.0:
22; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
23; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
24; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
25; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
26; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
27; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
28; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
29; AVX512VL-NEXT:    vzeroupper
30; AVX512VL-NEXT:    retq
31;
32; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
33; AVX512BW:       # %bb.0:
34; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
35; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
36; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
37; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
38; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
39; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
40; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
41; AVX512BW-NEXT:    vzeroupper
42; AVX512BW-NEXT:    retq
43;
44; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
45; AVX512BWVL:       # %bb.0:
46; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
47; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
48; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
49; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
50; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
51; AVX512BWVL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
52; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
53; AVX512BWVL-NEXT:    vzeroupper
54; AVX512BWVL-NEXT:    retq
55  %vec = load <64 x i8>, <64 x i8>* %L
56  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
57  store <32 x i8> %strided.vec, <32 x i8>* %S
58  ret void
59}
60
61define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind {
62; AVX512F-LABEL: shuffle_v32i16_to_v16i16_1:
63; AVX512F:       # %bb.0:
64; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
65; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
66; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
67; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
68; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
69; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
70; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
71; AVX512F-NEXT:    vzeroupper
72; AVX512F-NEXT:    retq
73;
74; AVX512VL-LABEL: shuffle_v32i16_to_v16i16_1:
75; AVX512VL:       # %bb.0:
76; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
77; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
78; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
79; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
80; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
81; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
82; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
83; AVX512VL-NEXT:    vzeroupper
84; AVX512VL-NEXT:    retq
85;
86; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
87; AVX512BW:       # %bb.0:
88; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
89; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
90; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
91; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
92; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
93; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
94; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
95; AVX512BW-NEXT:    vzeroupper
96; AVX512BW-NEXT:    retq
97;
98; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
99; AVX512BWVL:       # %bb.0:
100; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
101; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
102; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
103; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
104; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
105; AVX512BWVL-NEXT:    vzeroupper
106; AVX512BWVL-NEXT:    retq
107  %vec = load <32 x i16>, <32 x i16>* %L
108  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
109  store <16 x i16> %strided.vec, <16 x i16>* %S
110  ret void
111}
112
113define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
114; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
115; AVX512F:       # %bb.0:
116; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
117; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
118; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
119; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
120; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
121; AVX512F-NEXT:    vzeroupper
122; AVX512F-NEXT:    retq
123;
124; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
125; AVX512VL:       # %bb.0:
126; AVX512VL-NEXT:    vmovdqa64 (%rdi), %zmm0
127; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
128; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
129; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
130; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
131; AVX512VL-NEXT:    vzeroupper
132; AVX512VL-NEXT:    retq
133;
134; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
135; AVX512BW:       # %bb.0:
136; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
137; AVX512BW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
138; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
139; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
140; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
141; AVX512BW-NEXT:    vzeroupper
142; AVX512BW-NEXT:    retq
143;
144; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
145; AVX512BWVL:       # %bb.0:
146; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
147; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
148; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
149; AVX512BWVL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
150; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
151; AVX512BWVL-NEXT:    vzeroupper
152; AVX512BWVL-NEXT:    retq
153  %vec = load <16 x i32>, <16 x i32>* %L
154  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
155  store <8 x i32> %strided.vec, <8 x i32>* %S
156  ret void
157}
158
159define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
160; AVX512F-LABEL: shuffle_v64i8_to_v16i8_1:
161; AVX512F:       # %bb.0:
162; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
163; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
164; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
165; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
166; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
167; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
168; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
169; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
170; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
171; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
172; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
173; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
174; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
175; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
176; AVX512F-NEXT:    vzeroupper
177; AVX512F-NEXT:    retq
178;
179; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_1:
180; AVX512VL:       # %bb.0:
181; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
182; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
183; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
184; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
185; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
186; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
187; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
188; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
189; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
190; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
191; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
192; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
193; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
194; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
195; AVX512VL-NEXT:    vzeroupper
196; AVX512VL-NEXT:    retq
197;
198; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1:
199; AVX512BW:       # %bb.0:
200; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
201; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
202; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
203; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
204; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
205; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
206; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
207; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
208; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
209; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
210; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
211; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
212; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
213; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
214; AVX512BW-NEXT:    vzeroupper
215; AVX512BW-NEXT:    retq
216;
217; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1:
218; AVX512BWVL:       # %bb.0:
219; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
220; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
221; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
222; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
223; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
224; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
225; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
226; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
227; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
228; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
229; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
230; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
231; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
232; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
233; AVX512BWVL-NEXT:    vzeroupper
234; AVX512BWVL-NEXT:    retq
235  %vec = load <64 x i8>, <64 x i8>* %L
236  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
237  store <16 x i8> %strided.vec, <16 x i8>* %S
238  ret void
239}
240
241define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
242; AVX512F-LABEL: shuffle_v64i8_to_v16i8_2:
243; AVX512F:       # %bb.0:
244; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
245; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
246; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
247; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
248; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
249; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
250; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
251; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
252; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
253; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
254; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
255; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
256; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
257; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
258; AVX512F-NEXT:    vzeroupper
259; AVX512F-NEXT:    retq
260;
261; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_2:
262; AVX512VL:       # %bb.0:
263; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
264; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
265; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
266; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
267; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
268; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
269; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
270; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
271; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
272; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
273; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
274; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
275; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
276; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
277; AVX512VL-NEXT:    vzeroupper
278; AVX512VL-NEXT:    retq
279;
280; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2:
281; AVX512BW:       # %bb.0:
282; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
283; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
284; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
285; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
286; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
287; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
288; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
289; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
290; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
291; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
292; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
293; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
294; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
295; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
296; AVX512BW-NEXT:    vzeroupper
297; AVX512BW-NEXT:    retq
298;
299; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2:
300; AVX512BWVL:       # %bb.0:
301; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
302; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
303; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
304; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
305; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
306; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
307; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
308; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
309; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
310; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
311; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
312; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
313; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
314; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
315; AVX512BWVL-NEXT:    vzeroupper
316; AVX512BWVL-NEXT:    retq
317  %vec = load <64 x i8>, <64 x i8>* %L
318  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
319  store <16 x i8> %strided.vec, <16 x i8>* %S
320  ret void
321}
322
323define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
324; AVX512F-LABEL: shuffle_v64i8_to_v16i8_3:
325; AVX512F:       # %bb.0:
326; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
327; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
328; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
329; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
330; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
331; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
332; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
333; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
334; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
335; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
336; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
337; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
338; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
339; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
340; AVX512F-NEXT:    vzeroupper
341; AVX512F-NEXT:    retq
342;
343; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_3:
344; AVX512VL:       # %bb.0:
345; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
346; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
347; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
348; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
349; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
350; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
351; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
352; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
353; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
354; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
355; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
356; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
357; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
358; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
359; AVX512VL-NEXT:    vzeroupper
360; AVX512VL-NEXT:    retq
361;
362; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_3:
363; AVX512BW:       # %bb.0:
364; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
365; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
366; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
367; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
368; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
369; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
370; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
371; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
372; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
373; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
374; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
375; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
376; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
377; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
378; AVX512BW-NEXT:    vzeroupper
379; AVX512BW-NEXT:    retq
380;
381; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_3:
382; AVX512BWVL:       # %bb.0:
383; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
384; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
385; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
386; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
387; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
388; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
389; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
390; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
391; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
392; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
393; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
394; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
395; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
396; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
397; AVX512BWVL-NEXT:    vzeroupper
398; AVX512BWVL-NEXT:    retq
399  %vec = load <64 x i8>, <64 x i8>* %L
400  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
401  store <16 x i8> %strided.vec, <16 x i8>* %S
402  ret void
403}
404
405define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
406; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
407; AVX512F:       # %bb.0:
408; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
409; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
410; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
411; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
412; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
413; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
414; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
415; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
416; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
417; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
418; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
419; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
420; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
421; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
422; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
423; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
424; AVX512F-NEXT:    vzeroupper
425; AVX512F-NEXT:    retq
426;
427; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
428; AVX512VL:       # %bb.0:
429; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
430; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
431; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
432; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
433; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
434; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
435; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
436; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
437; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
438; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
439; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
440; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
441; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
442; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
443; AVX512VL-NEXT:    vzeroupper
444; AVX512VL-NEXT:    retq
445;
446; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
447; AVX512BW:       # %bb.0:
448; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
449; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
450; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
451; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
452; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
453; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
454; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
455; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
456; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
457; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
458; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
459; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
460; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
461; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
462; AVX512BW-NEXT:    vzeroupper
463; AVX512BW-NEXT:    retq
464;
465; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
466; AVX512BWVL:       # %bb.0:
467; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
468; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
469; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
470; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
471; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
472; AVX512BWVL-NEXT:    vzeroupper
473; AVX512BWVL-NEXT:    retq
474  %vec = load <32 x i16>, <32 x i16>* %L
475  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
476  store <8 x i16> %strided.vec, <8 x i16>* %S
477  ret void
478}
479
480define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
481; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
482; AVX512F:       # %bb.0:
483; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
484; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
485; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
486; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
487; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
488; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
489; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
490; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
491; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
492; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
493; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
494; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
495; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
496; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
497; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
498; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
499; AVX512F-NEXT:    vzeroupper
500; AVX512F-NEXT:    retq
501;
502; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
503; AVX512VL:       # %bb.0:
504; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
505; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
506; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
507; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
508; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
509; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
510; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
511; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
512; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
513; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
514; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
515; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
516; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
517; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
518; AVX512VL-NEXT:    vzeroupper
519; AVX512VL-NEXT:    retq
520;
521; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
522; AVX512BW:       # %bb.0:
523; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
524; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
525; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
526; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
527; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
528; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
529; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
530; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
531; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
532; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
533; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
534; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
535; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
536; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
537; AVX512BW-NEXT:    vzeroupper
538; AVX512BW-NEXT:    retq
539;
540; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
541; AVX512BWVL:       # %bb.0:
542; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
543; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
544; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
545; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
546; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
547; AVX512BWVL-NEXT:    vzeroupper
548; AVX512BWVL-NEXT:    retq
549  %vec = load <32 x i16>, <32 x i16>* %L
550  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
551  store <8 x i16> %strided.vec, <8 x i16>* %S
552  ret void
553}
554
555define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
556; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
557; AVX512F:       # %bb.0:
558; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
559; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
560; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
561; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
562; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
563; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
564; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
565; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
566; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
567; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
568; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
569; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
570; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
571; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
572; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
573; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
574; AVX512F-NEXT:    vzeroupper
575; AVX512F-NEXT:    retq
576;
577; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
578; AVX512VL:       # %bb.0:
579; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
580; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
581; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
582; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
583; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
584; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
585; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
586; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
587; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
588; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
589; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
590; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
591; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
592; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
593; AVX512VL-NEXT:    vzeroupper
594; AVX512VL-NEXT:    retq
595;
596; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
597; AVX512BW:       # %bb.0:
598; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
599; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
600; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
601; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
602; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
603; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
604; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
605; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
606; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
607; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
608; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
609; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
610; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
611; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
612; AVX512BW-NEXT:    vzeroupper
613; AVX512BW-NEXT:    retq
614;
615; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
616; AVX512BWVL:       # %bb.0:
617; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
618; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
619; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
620; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
621; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
622; AVX512BWVL-NEXT:    vzeroupper
623; AVX512BWVL-NEXT:    retq
624  %vec = load <32 x i16>, <32 x i16>* %L
625  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
626  store <8 x i16> %strided.vec, <8 x i16>* %S
627  ret void
628}
629
630define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
631; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
632; AVX512F:       # %bb.0:
633; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
634; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
635; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
636; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
637; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
638; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
639; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
640; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
641; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
642; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
643; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
644; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
645; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
646; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
647; AVX512F-NEXT:    vzeroupper
648; AVX512F-NEXT:    retq
649;
650; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
651; AVX512VL:       # %bb.0:
652; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
653; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
654; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
655; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
656; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
657; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
658; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
659; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
660; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
661; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
662; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
663; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
664; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
665; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
666; AVX512VL-NEXT:    vzeroupper
667; AVX512VL-NEXT:    retq
668;
669; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
670; AVX512BW:       # %bb.0:
671; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
672; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
673; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
674; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
675; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
676; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
677; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
678; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
679; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
680; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
681; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
682; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
683; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
684; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
685; AVX512BW-NEXT:    vzeroupper
686; AVX512BW-NEXT:    retq
687;
688; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
689; AVX512BWVL:       # %bb.0:
690; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
691; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
692; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
693; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
694; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
695; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
696; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
697; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
698; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
699; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
700; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
701; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
702; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
703; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
704; AVX512BWVL-NEXT:    vzeroupper
705; AVX512BWVL-NEXT:    retq
706  %vec = load <64 x i8>, <64 x i8>* %L
707  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
708  store <8 x i8> %strided.vec, <8 x i8>* %S
709  ret void
710}
711
712define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
713; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
714; AVX512F:       # %bb.0:
715; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
716; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
717; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
718; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
719; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
720; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
721; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
722; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
723; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
724; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
725; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
726; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
727; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
728; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
729; AVX512F-NEXT:    vzeroupper
730; AVX512F-NEXT:    retq
731;
732; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
733; AVX512VL:       # %bb.0:
734; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
735; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
736; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
737; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
738; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
739; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
740; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
741; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
742; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
743; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
744; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
745; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
746; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
747; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
748; AVX512VL-NEXT:    vzeroupper
749; AVX512VL-NEXT:    retq
750;
751; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
752; AVX512BW:       # %bb.0:
753; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
754; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
755; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
756; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
757; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
758; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
759; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
760; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
761; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
762; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
763; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
764; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
765; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
766; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
767; AVX512BW-NEXT:    vzeroupper
768; AVX512BW-NEXT:    retq
769;
770; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
771; AVX512BWVL:       # %bb.0:
772; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
773; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
774; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
775; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
776; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
777; AVX512BWVL-NEXT:    vzeroupper
778; AVX512BWVL-NEXT:    retq
779  %vec = load <64 x i8>, <64 x i8>* %L
780  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
781  store <8 x i8> %strided.vec, <8 x i8>* %S
782  ret void
783}
784
785define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
786; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
787; AVX512F:       # %bb.0:
788; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
789; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
790; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
791; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
792; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
793; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
794; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
795; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
796; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
797; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
798; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
799; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
800; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
801; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
802; AVX512F-NEXT:    vzeroupper
803; AVX512F-NEXT:    retq
804;
805; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
806; AVX512VL:       # %bb.0:
807; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
808; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
809; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
810; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
811; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
812; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
813; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
814; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
815; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
816; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
817; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
818; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
819; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
820; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
821; AVX512VL-NEXT:    vzeroupper
822; AVX512VL-NEXT:    retq
823;
824; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
825; AVX512BW:       # %bb.0:
826; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
827; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
828; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
829; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
830; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
831; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
832; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
833; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
834; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
835; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
836; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
837; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
838; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
839; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
840; AVX512BW-NEXT:    vzeroupper
841; AVX512BW-NEXT:    retq
842;
843; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
844; AVX512BWVL:       # %bb.0:
845; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
846; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
847; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
848; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
849; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
850; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
851; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
852; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
853; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
854; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
855; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
856; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
857; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
858; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
859; AVX512BWVL-NEXT:    vzeroupper
860; AVX512BWVL-NEXT:    retq
861  %vec = load <64 x i8>, <64 x i8>* %L
862  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
863  store <8 x i8> %strided.vec, <8 x i8>* %S
864  ret void
865}
866
867define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
868; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
869; AVX512F:       # %bb.0:
870; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
871; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
872; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
873; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
874; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
875; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
876; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
877; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
878; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
879; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
880; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
881; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
882; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
883; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
884; AVX512F-NEXT:    vzeroupper
885; AVX512F-NEXT:    retq
886;
887; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
888; AVX512VL:       # %bb.0:
889; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
890; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
891; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
892; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
893; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
894; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
895; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
896; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
897; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
898; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
899; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
900; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
901; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
902; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
903; AVX512VL-NEXT:    vzeroupper
904; AVX512VL-NEXT:    retq
905;
906; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
907; AVX512BW:       # %bb.0:
908; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
909; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
910; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
911; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
912; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
913; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
914; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
915; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
916; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
917; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
918; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
919; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
920; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
921; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
922; AVX512BW-NEXT:    vzeroupper
923; AVX512BW-NEXT:    retq
924;
925; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
926; AVX512BWVL:       # %bb.0:
927; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
928; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
929; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
930; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
931; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
932; AVX512BWVL-NEXT:    vzeroupper
933; AVX512BWVL-NEXT:    retq
934  %vec = load <64 x i8>, <64 x i8>* %L
935  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
936  store <8 x i8> %strided.vec, <8 x i8>* %S
937  ret void
938}
939
940define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
941; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
942; AVX512F:       # %bb.0:
943; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
944; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
945; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
946; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
947; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
948; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
949; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
950; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
951; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
952; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
953; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
954; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
955; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
956; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
957; AVX512F-NEXT:    vzeroupper
958; AVX512F-NEXT:    retq
959;
960; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
961; AVX512VL:       # %bb.0:
962; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
963; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
964; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
965; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
966; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
967; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
968; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
969; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
970; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
971; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
972; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
973; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
974; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
975; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
976; AVX512VL-NEXT:    vzeroupper
977; AVX512VL-NEXT:    retq
978;
979; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
980; AVX512BW:       # %bb.0:
981; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
982; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
983; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
984; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
985; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
986; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
987; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
988; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
989; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
990; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
991; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
992; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
993; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
994; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
995; AVX512BW-NEXT:    vzeroupper
996; AVX512BW-NEXT:    retq
997;
998; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
999; AVX512BWVL:       # %bb.0:
1000; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
1001; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1002; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
1003; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
1004; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1005; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1006; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1007; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
1008; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
1009; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1010; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1011; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1012; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1013; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
1014; AVX512BWVL-NEXT:    vzeroupper
1015; AVX512BWVL-NEXT:    retq
1016  %vec = load <64 x i8>, <64 x i8>* %L
1017  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
1018  store <8 x i8> %strided.vec, <8 x i8>* %S
1019  ret void
1020}
1021
1022define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
1023; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
1024; AVX512F:       # %bb.0:
1025; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1026; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
1027; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
1028; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
1029; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1030; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1031; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1032; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
1033; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1034; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1035; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1036; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1037; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1038; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
1039; AVX512F-NEXT:    vzeroupper
1040; AVX512F-NEXT:    retq
1041;
1042; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
1043; AVX512VL:       # %bb.0:
1044; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
1045; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
1046; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
1047; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
1048; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1049; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1050; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1051; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
1052; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1053; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1054; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1055; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1056; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1057; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
1058; AVX512VL-NEXT:    vzeroupper
1059; AVX512VL-NEXT:    retq
1060;
1061; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
1062; AVX512BW:       # %bb.0:
1063; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1064; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1065; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
1066; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
1067; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1068; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1069; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1070; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
1071; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1072; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1073; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1074; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1075; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1076; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
1077; AVX512BW-NEXT:    vzeroupper
1078; AVX512BW-NEXT:    retq
1079;
1080; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
1081; AVX512BWVL:       # %bb.0:
1082; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
1083; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1084; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
1085; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
1086; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
1087; AVX512BWVL-NEXT:    vzeroupper
1088; AVX512BWVL-NEXT:    retq
1089  %vec = load <64 x i8>, <64 x i8>* %L
1090  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
1091  store <8 x i8> %strided.vec, <8 x i8>* %S
1092  ret void
1093}
1094
1095define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
1096; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
1097; AVX512F:       # %bb.0:
1098; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1099; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
1100; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
1101; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
1102; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1103; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1104; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1105; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
1106; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1107; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1108; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1109; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1110; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1111; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
1112; AVX512F-NEXT:    vzeroupper
1113; AVX512F-NEXT:    retq
1114;
1115; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
1116; AVX512VL:       # %bb.0:
1117; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
1118; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
1119; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
1120; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
1121; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1122; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1123; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1124; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
1125; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1126; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1127; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1128; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1129; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1130; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
1131; AVX512VL-NEXT:    vzeroupper
1132; AVX512VL-NEXT:    retq
1133;
1134; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
1135; AVX512BW:       # %bb.0:
1136; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1137; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1138; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
1139; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
1140; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1141; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1142; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1143; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
1144; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1145; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1146; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1147; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1148; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1149; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
1150; AVX512BW-NEXT:    vzeroupper
1151; AVX512BW-NEXT:    retq
1152;
1153; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
1154; AVX512BWVL:       # %bb.0:
1155; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
1156; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1157; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
1158; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
1159; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1160; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1161; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1162; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
1163; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
1164; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1165; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1166; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1167; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1168; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
1169; AVX512BWVL-NEXT:    vzeroupper
1170; AVX512BWVL-NEXT:    retq
1171  %vec = load <64 x i8>, <64 x i8>* %L
1172  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
1173  store <8 x i8> %strided.vec, <8 x i8>* %S
1174  ret void
1175}
1176
1177