• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
7
8define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
9; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
10; AVX1:       # %bb.0:
11; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
12; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
13; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
14; AVX1-NEXT:    retq
15;
16; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
17; AVX2OR512VL:       # %bb.0:
18; AVX2OR512VL-NEXT:    vpbroadcastw %xmm0, %ymm0
19; AVX2OR512VL-NEXT:    retq
20  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
21  ret <16 x i16> %shuffle
22}
23
24define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
25; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
26; AVX1:       # %bb.0:
27; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
28; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
29; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
30; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
31; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
32; AVX1-NEXT:    retq
33;
34; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
35; AVX2-SLOW:       # %bb.0:
36; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
37; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
38; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
39; AVX2-SLOW-NEXT:    retq
40;
41; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
42; AVX2-FAST:       # %bb.0:
43; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
44; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
45; AVX2-FAST-NEXT:    retq
46;
47; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
48; AVX512VL-SLOW:       # %bb.0:
49; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
50; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
51; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
52; AVX512VL-SLOW-NEXT:    retq
53;
54; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
55; AVX512VL-FAST:       # %bb.0:
56; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
57; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
58; AVX512VL-FAST-NEXT:    retq
59  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
60  ret <16 x i16> %shuffle
61}
62
63define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
64; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
65; AVX1:       # %bb.0:
66; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
67; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
68; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
69; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
70; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
71; AVX1-NEXT:    retq
72;
73; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
74; AVX2-SLOW:       # %bb.0:
75; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
76; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
77; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
78; AVX2-SLOW-NEXT:    retq
79;
80; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
81; AVX2-FAST:       # %bb.0:
82; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
83; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
84; AVX2-FAST-NEXT:    retq
85;
86; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
87; AVX512VL-SLOW:       # %bb.0:
88; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
89; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
90; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
91; AVX512VL-SLOW-NEXT:    retq
92;
93; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
94; AVX512VL-FAST:       # %bb.0:
95; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
96; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
97; AVX512VL-FAST-NEXT:    retq
98  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
99  ret <16 x i16> %shuffle
100}
101
102define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
103; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
104; AVX1:       # %bb.0:
105; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
106; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
107; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
108; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
109; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
110; AVX1-NEXT:    retq
111;
112; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
113; AVX2-SLOW:       # %bb.0:
114; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
115; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
116; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
117; AVX2-SLOW-NEXT:    retq
118;
119; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
120; AVX2-FAST:       # %bb.0:
121; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
122; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
123; AVX2-FAST-NEXT:    retq
124;
125; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
126; AVX512VL-SLOW:       # %bb.0:
127; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
128; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
129; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
130; AVX512VL-SLOW-NEXT:    retq
131;
132; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
133; AVX512VL-FAST:       # %bb.0:
134; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
135; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
136; AVX512VL-FAST-NEXT:    retq
137  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
138  ret <16 x i16> %shuffle
139}
140
141define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
142; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
143; AVX1:       # %bb.0:
144; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
145; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
146; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
147; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
148; AVX1-NEXT:    retq
149;
150; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
151; AVX2OR512VL:       # %bb.0:
152; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
153; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
154; AVX2OR512VL-NEXT:    retq
155  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
156  ret <16 x i16> %shuffle
157}
158
159define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
160; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
161; AVX1:       # %bb.0:
162; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
163; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
164; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
165; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
166; AVX1-NEXT:    retq
167;
168; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
169; AVX2OR512VL:       # %bb.0:
170; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
171; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
172; AVX2OR512VL-NEXT:    retq
173  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
174  ret <16 x i16> %shuffle
175}
176
177define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
178; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
179; AVX1:       # %bb.0:
180; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
181; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
182; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
183; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
184; AVX1-NEXT:    retq
185;
186; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
187; AVX2OR512VL:       # %bb.0:
188; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
189; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
190; AVX2OR512VL-NEXT:    retq
191  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
192  ret <16 x i16> %shuffle
193}
194
195define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
196; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
197; AVX1:       # %bb.0:
198; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
199; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
200; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
201; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
202; AVX1-NEXT:    retq
203;
204; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
205; AVX2OR512VL:       # %bb.0:
206; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
207; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
208; AVX2OR512VL-NEXT:    retq
209  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
210  ret <16 x i16> %shuffle
211}
212
213define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
214; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
215; AVX1:       # %bb.0:
216; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
217; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
218; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
219; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
220; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
221; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
222; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
223; AVX1-NEXT:    retq
224;
225; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
226; AVX2-SLOW:       # %bb.0:
227; AVX2-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm1
228; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
229; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
230; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
231; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
232; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
233; AVX2-SLOW-NEXT:    retq
234;
235; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
236; AVX2-FAST:       # %bb.0:
237; AVX2-FAST-NEXT:    vpbroadcastw %xmm0, %xmm1
238; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
239; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
240; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
241; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
242; AVX2-FAST-NEXT:    retq
243;
244; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
245; AVX512VL:       # %bb.0:
246; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
247; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
248; AVX512VL-NEXT:    retq
249  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
250  ret <16 x i16> %shuffle
251}
252
253define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
254; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
255; AVX1:       # %bb.0:
256; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
257; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
258; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
259; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
260; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
261; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
262; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
263; AVX1-NEXT:    retq
264;
265; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
266; AVX2:       # %bb.0:
267; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
268; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
269; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
270; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
271; AVX2-NEXT:    retq
272;
273; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
274; AVX512VL:       # %bb.0:
275; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
276; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
277; AVX512VL-NEXT:    retq
278  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
279  ret <16 x i16> %shuffle
280}
281
282define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
283; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
284; AVX1:       # %bb.0:
285; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
286; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
287; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
288; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
289; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
290; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
291; AVX1-NEXT:    retq
292;
293; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
294; AVX2-SLOW:       # %bb.0:
295; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
296; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
297; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
298; AVX2-SLOW-NEXT:    retq
299;
300; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
301; AVX2-FAST:       # %bb.0:
302; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
303; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
304; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
305; AVX2-FAST-NEXT:    retq
306;
307; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
308; AVX512VL:       # %bb.0:
309; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
310; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
311; AVX512VL-NEXT:    retq
312  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
313  ret <16 x i16> %shuffle
314}
315
316define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
317; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
318; AVX1:       # %bb.0:
319; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
320; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
321; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
322; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
323; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
324; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
325; AVX1-NEXT:    retq
326;
327; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
328; AVX2-SLOW:       # %bb.0:
329; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
330; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
331; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
332; AVX2-SLOW-NEXT:    retq
333;
334; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
335; AVX2-FAST:       # %bb.0:
336; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
337; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
338; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
339; AVX2-FAST-NEXT:    retq
340;
341; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
342; AVX512VL:       # %bb.0:
343; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
344; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
345; AVX512VL-NEXT:    retq
346  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
347  ret <16 x i16> %shuffle
348}
349
350define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
351; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
352; AVX1:       # %bb.0:
353; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
354; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
355; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
356; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
357; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
358; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
359; AVX1-NEXT:    retq
360;
361; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
362; AVX2:       # %bb.0:
363; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
364; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
365; AVX2-NEXT:    retq
366;
367; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
368; AVX512VL:       # %bb.0:
369; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
370; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
371; AVX512VL-NEXT:    retq
372  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
373  ret <16 x i16> %shuffle
374}
375
376define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
377; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
378; AVX1:       # %bb.0:
379; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
380; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
381; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
382; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
383; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
384; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
385; AVX1-NEXT:    retq
386;
387; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
388; AVX2:       # %bb.0:
389; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
390; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
391; AVX2-NEXT:    retq
392;
393; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
394; AVX512VL:       # %bb.0:
395; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
396; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
397; AVX512VL-NEXT:    retq
398  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
399  ret <16 x i16> %shuffle
400}
401
402define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
403; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
404; AVX1:       # %bb.0:
405; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
406; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
407; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
408; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
409; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
410; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
411; AVX1-NEXT:    retq
412;
413; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
414; AVX2:       # %bb.0:
415; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
416; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
417; AVX2-NEXT:    retq
418;
419; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
420; AVX512VL:       # %bb.0:
421; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
422; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
423; AVX512VL-NEXT:    retq
424  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
425  ret <16 x i16> %shuffle
426}
427
428define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
429; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
430; AVX1:       # %bb.0:
431; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
432; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
433; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
434; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
435; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
436; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
437; AVX1-NEXT:    retq
438;
439; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
440; AVX2:       # %bb.0:
441; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
442; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
443; AVX2-NEXT:    retq
444;
445; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
446; AVX512VL:       # %bb.0:
447; AVX512VL-NEXT:    movl $15, %eax
448; AVX512VL-NEXT:    vmovd %eax, %xmm1
449; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
450; AVX512VL-NEXT:    retq
451  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
452  ret <16 x i16> %shuffle
453}
454
455define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
456; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
457; AVX1:       # %bb.0:
458; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
459; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
460; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
461; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
462; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
463; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
464; AVX1-NEXT:    retq
465;
466; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
467; AVX2-SLOW:       # %bb.0:
468; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
469; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
470; AVX2-SLOW-NEXT:    retq
471;
472; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
473; AVX2-FAST:       # %bb.0:
474; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
475; AVX2-FAST-NEXT:    retq
476;
477; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
478; AVX512VL-SLOW:       # %bb.0:
479; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
480; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
481; AVX512VL-SLOW-NEXT:    retq
482;
483; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
484; AVX512VL-FAST:       # %bb.0:
485; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
486; AVX512VL-FAST-NEXT:    retq
487  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
488  ret <16 x i16> %shuffle
489}
490
491define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
492; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
493; AVX1:       # %bb.0:
494; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,6,7]
495; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
496; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
497; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
498; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
499; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
500; AVX1-NEXT:    retq
501;
502; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
503; AVX2-SLOW:       # %bb.0:
504; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
505; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
506; AVX2-SLOW-NEXT:    retq
507;
508; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
509; AVX2-FAST:       # %bb.0:
510; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
511; AVX2-FAST-NEXT:    retq
512;
513; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
514; AVX512VL-SLOW:       # %bb.0:
515; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
516; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
517; AVX512VL-SLOW-NEXT:    retq
518;
519; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
520; AVX512VL-FAST:       # %bb.0:
521; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
522; AVX512VL-FAST-NEXT:    retq
523  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
524  ret <16 x i16> %shuffle
525}
526
527define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
528; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
529; AVX1:       # %bb.0:
530; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
531; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
532; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
533; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
534; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
535; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
536; AVX1-NEXT:    retq
537;
538; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
539; AVX2-SLOW:       # %bb.0:
540; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
541; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
542; AVX2-SLOW-NEXT:    retq
543;
544; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
545; AVX2-FAST:       # %bb.0:
546; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
547; AVX2-FAST-NEXT:    retq
548;
549; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
550; AVX512VL-SLOW:       # %bb.0:
551; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
552; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
553; AVX512VL-SLOW-NEXT:    retq
554;
555; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
556; AVX512VL-FAST:       # %bb.0:
557; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
558; AVX512VL-FAST-NEXT:    retq
559  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
560  ret <16 x i16> %shuffle
561}
562
563define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
564; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
565; AVX1:       # %bb.0:
566; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
567; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
568; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
569; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
570; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
571; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
572; AVX1-NEXT:    retq
573;
574; AVX2-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
575; AVX2-SLOW:       # %bb.0:
576; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
577; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
578; AVX2-SLOW-NEXT:    retq
579;
580; AVX2-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
581; AVX2-FAST:       # %bb.0:
582; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
583; AVX2-FAST-NEXT:    retq
584;
585; AVX512VL-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
586; AVX512VL-SLOW:       # %bb.0:
587; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
588; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
589; AVX512VL-SLOW-NEXT:    retq
590;
591; AVX512VL-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
592; AVX512VL-FAST:       # %bb.0:
593; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
594; AVX512VL-FAST-NEXT:    retq
595  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
596  ret <16 x i16> %shuffle
597}
598
599define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) {
600; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
601; AVX1:       # %bb.0:
602; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
603; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
604; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
605; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
606; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
607; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
608; AVX1-NEXT:    retq
609;
610; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
611; AVX2-SLOW:       # %bb.0:
612; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
613; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
614; AVX2-SLOW-NEXT:    retq
615;
616; AVX2-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
617; AVX2-FAST:       # %bb.0:
618; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
619; AVX2-FAST-NEXT:    retq
620;
621; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
622; AVX512VL-SLOW:       # %bb.0:
623; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
624; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
625; AVX512VL-SLOW-NEXT:    retq
626;
627; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
628; AVX512VL-FAST:       # %bb.0:
629; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
630; AVX512VL-FAST-NEXT:    retq
631  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
632  ret <16 x i16> %shuffle
633}
634
635define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) {
636; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
637; AVX1:       # %bb.0:
638; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
639; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7]
640; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
641; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
642; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
643; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
644; AVX1-NEXT:    retq
645;
646; AVX2-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
647; AVX2-SLOW:       # %bb.0:
648; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
649; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
650; AVX2-SLOW-NEXT:    retq
651;
652; AVX2-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
653; AVX2-FAST:       # %bb.0:
654; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
655; AVX2-FAST-NEXT:    retq
656;
657; AVX512VL-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
658; AVX512VL-SLOW:       # %bb.0:
659; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
660; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
661; AVX512VL-SLOW-NEXT:    retq
662;
663; AVX512VL-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
664; AVX512VL-FAST:       # %bb.0:
665; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
666; AVX512VL-FAST-NEXT:    retq
667  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
668  ret <16 x i16> %shuffle
669}
670
671define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
672; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
673; AVX1:       # %bb.0:
674; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
675; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
676; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
677; AVX1-NEXT:    retq
678;
679; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
680; AVX2-SLOW:       # %bb.0:
681; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
682; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
683; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
684; AVX2-SLOW-NEXT:    retq
685;
686; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
687; AVX2-FAST:       # %bb.0:
688; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
689; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
690; AVX2-FAST-NEXT:    retq
691;
692; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
693; AVX512VL-SLOW:       # %bb.0:
694; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
695; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
696; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
697; AVX512VL-SLOW-NEXT:    retq
698;
699; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
700; AVX512VL-FAST:       # %bb.0:
701; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
702; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
703; AVX512VL-FAST-NEXT:    retq
704  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
705  ret <16 x i16> %shuffle
706}
707
708define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
709; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
710; AVX1:       # %bb.0:
711; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
712; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
713; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
714; AVX1-NEXT:    retq
715;
716; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
717; AVX2-SLOW:       # %bb.0:
718; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
719; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
720; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
721; AVX2-SLOW-NEXT:    retq
722;
723; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
724; AVX2-FAST:       # %bb.0:
725; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
726; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
727; AVX2-FAST-NEXT:    retq
728;
729; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
730; AVX512VL-SLOW:       # %bb.0:
731; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
732; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
733; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
734; AVX512VL-SLOW-NEXT:    retq
735;
736; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
737; AVX512VL-FAST:       # %bb.0:
738; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
739; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
740; AVX512VL-FAST-NEXT:    retq
741  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
742  ret <16 x i16> %shuffle
743}
744
745define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
746; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
747; AVX1:       # %bb.0:
748; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
749; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
750; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
751; AVX1-NEXT:    retq
752;
753; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
754; AVX2-SLOW:       # %bb.0:
755; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
756; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
757; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
758; AVX2-SLOW-NEXT:    retq
759;
760; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
761; AVX2-FAST:       # %bb.0:
762; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
763; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
764; AVX2-FAST-NEXT:    retq
765;
766; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
767; AVX512VL-SLOW:       # %bb.0:
768; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
769; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
770; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
771; AVX512VL-SLOW-NEXT:    retq
772;
773; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
774; AVX512VL-FAST:       # %bb.0:
775; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
776; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
777; AVX512VL-FAST-NEXT:    retq
778  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
779  ret <16 x i16> %shuffle
780}
781
782define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
783; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
784; AVX1:       # %bb.0:
785; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
786; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
787; AVX1-NEXT:    retq
788;
789; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
790; AVX2OR512VL:       # %bb.0:
791; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
792; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
793; AVX2OR512VL-NEXT:    retq
794  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
795  ret <16 x i16> %shuffle
796}
797
798define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
799; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
800; AVX1:       # %bb.0:
801; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
802; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
803; AVX1-NEXT:    retq
804;
805; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
806; AVX2OR512VL:       # %bb.0:
807; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
808; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
809; AVX2OR512VL-NEXT:    retq
810  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
811  ret <16 x i16> %shuffle
812}
813
814define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
815; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
816; AVX1:       # %bb.0:
817; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
818; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
819; AVX1-NEXT:    retq
820;
821; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
822; AVX2OR512VL:       # %bb.0:
823; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
824; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
825; AVX2OR512VL-NEXT:    retq
826  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
827  ret <16 x i16> %shuffle
828}
829
830define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
831; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
832; AVX1:       # %bb.0:
833; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
834; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
835; AVX1-NEXT:    retq
836;
837; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
838; AVX2OR512VL:       # %bb.0:
839; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
840; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
841; AVX2OR512VL-NEXT:    retq
842  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
843  ret <16 x i16> %shuffle
844}
845
846define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
847; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
848; AVX1:       # %bb.0:
849; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
850; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
851; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
852; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
853; AVX1-NEXT:    retq
854;
855; AVX2OR512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
856; AVX2OR512VL:       # %bb.0:
857; AVX2OR512VL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
858; AVX2OR512VL-NEXT:    retq
859  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
860  ret <16 x i16> %shuffle
861}
862
863define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
864; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
865; AVX1:       # %bb.0:
866; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
867; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
868; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
869; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
870; AVX1-NEXT:    retq
871;
872; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
873; AVX2OR512VL:       # %bb.0:
874; AVX2OR512VL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
875; AVX2OR512VL-NEXT:    retq
876  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
877  ret <16 x i16> %shuffle
878}
879
880define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
881; ALL-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
882; ALL:       # %bb.0:
883; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
884; ALL-NEXT:    retq
885  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
886  ret <16 x i16> %shuffle
887}
888
889define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
890; ALL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
891; ALL:       # %bb.0:
892; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
893; ALL-NEXT:    retq
894  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15>
895  ret <16 x i16> %shuffle
896}
897
898define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
899; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
900; AVX1:       # %bb.0:
901; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0]
902; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
903; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
904; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
905; AVX1-NEXT:    retq
906;
907; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
908; AVX2:       # %bb.0:
909; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
910; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
911; AVX2-NEXT:    retq
912;
913; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
914; AVX512VL:       # %bb.0:
915; AVX512VL-NEXT:    movw $-32768, %ax # imm = 0x8000
916; AVX512VL-NEXT:    kmovd %eax, %k1
917; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
918; AVX512VL-NEXT:    retq
919  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
920  ret <16 x i16> %shuffle
921}
922
923define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
924; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
925; AVX1:       # %bb.0:
926; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
927; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
928; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
929; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
930; AVX1-NEXT:    retq
931;
932; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
933; AVX2:       # %bb.0:
934; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
935; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
936; AVX2-NEXT:    retq
937;
938; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
939; AVX512VL:       # %bb.0:
940; AVX512VL-NEXT:    movw $1, %ax
941; AVX512VL-NEXT:    kmovd %eax, %k1
942; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
943; AVX512VL-NEXT:    retq
944  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
945  ret <16 x i16> %shuffle
946}
947
948define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
949; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
950; AVX1:       # %bb.0:
951; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535]
952; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
953; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
954; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
955; AVX1-NEXT:    retq
956;
957; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
958; AVX2:       # %bb.0:
959; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
960; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
961; AVX2-NEXT:    retq
962;
963; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
964; AVX512VL:       # %bb.0:
965; AVX512VL-NEXT:    movw $21930, %ax # imm = 0x55AA
966; AVX512VL-NEXT:    kmovd %eax, %k1
967; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
968; AVX512VL-NEXT:    retq
969  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
970  ret <16 x i16> %shuffle
971}
972
973define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
974; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
975; AVX1:       # %bb.0:
976; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0]
977; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
978; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
979; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
980; AVX1-NEXT:    retq
981;
982; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
983; AVX2:       # %bb.0:
984; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
985; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
986; AVX2-NEXT:    retq
987;
988; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
989; AVX512VL:       # %bb.0:
990; AVX512VL-NEXT:    movw $-21931, %ax # imm = 0xAA55
991; AVX512VL-NEXT:    kmovd %eax, %k1
992; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
993; AVX512VL-NEXT:    retq
994  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
995  ret <16 x i16> %shuffle
996}
997
998define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
999; ALL-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
1000; ALL:       # %bb.0:
1001; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
1002; ALL-NEXT:    retq
1003  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
1004  ret <16 x i16> %shuffle
1005}
1006
1007define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
1008; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
1009; AVX1:       # %bb.0:
1010; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1011; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1012; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1013; AVX1-NEXT:    retq
1014;
1015; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
1016; AVX2OR512VL:       # %bb.0:
1017; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1018; AVX2OR512VL-NEXT:    vpbroadcastd %xmm0, %ymm0
1019; AVX2OR512VL-NEXT:    retq
1020  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
1021  ret <16 x i16> %shuffle
1022}
1023
1024define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
1025; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
1026; AVX1:       # %bb.0:
1027; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1028; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1029; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1030; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1031; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1032; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1033; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
1034; AVX1-NEXT:    retq
1035;
1036; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
1037; AVX2-SLOW:       # %bb.0:
1038; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
1039; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
1040; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
1041; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1042; AVX2-SLOW-NEXT:    retq
1043;
1044; AVX2-FAST-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
1045; AVX2-FAST:       # %bb.0:
1046; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
1047; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
1048; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1049; AVX2-FAST-NEXT:    retq
1050;
1051; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
1052; AVX512VL:       # %bb.0:
1053; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24]
1054; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1055; AVX512VL-NEXT:    retq
1056  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
1057  ret <16 x i16> %shuffle
1058}
1059
1060define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
1061; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
1062; AVX1:       # %bb.0:
1063; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1064; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1065; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
1066; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1067; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1068; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1069; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1070; AVX1-NEXT:    retq
1071;
1072; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
1073; AVX2:       # %bb.0:
1074; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
1075; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1076; AVX2-NEXT:    retq
1077;
1078; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
1079; AVX512VL:       # %bb.0:
1080; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31]
1081; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
1082; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
1083; AVX512VL-NEXT:    retq
1084  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
1085  ret <16 x i16> %shuffle
1086}
1087
1088define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
1089; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
1090; AVX1:       # %bb.0:
1091; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1092; AVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1093; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1094; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1095; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1096; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1097; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1098; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1099; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1100; AVX1-NEXT:    retq
1101;
1102; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
1103; AVX2-SLOW:       # %bb.0:
1104; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1105; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
1106; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
1107; AVX2-SLOW-NEXT:    retq
1108;
1109; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
1110; AVX2-FAST:       # %bb.0:
1111; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1112; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15,12,13,10,11,8,9,22,23,20,21,18,19,16,17,30,31,28,29,26,27,24,25]
1113; AVX2-FAST-NEXT:    retq
1114;
1115; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
1116; AVX512VL:       # %bb.0:
1117; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28]
1118; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
1119; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
1120; AVX512VL-NEXT:    retq
1121  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
1122  ret <16 x i16> %shuffle
1123}
1124
1125define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) {
1126; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
1127; AVX1:       # %bb.0:
1128; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1129; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1130; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1131; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,14,15,10,11,6,7,2,3]
1132; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1133; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1134; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1135; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1136; AVX1-NEXT:    retq
1137;
1138; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
1139; AVX2:       # %bb.0:
1140; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
1141; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
1142; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
1143; AVX2-NEXT:    retq
1144;
1145; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
1146; AVX512VL:       # %bb.0:
1147; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24]
1148; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
1149; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
1150; AVX512VL-NEXT:    retq
1151  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
1152  ret <16 x i16> %shuffle
1153}
1154
1155define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
1156; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
1157; AVX1:       # %bb.0:
1158; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7]
1159; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
1160; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1161; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
1162; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
1163; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1164; AVX1-NEXT:    retq
1165;
1166; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
1167; AVX2-SLOW:       # %bb.0:
1168; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15]
1169; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
1170; AVX2-SLOW-NEXT:    retq
1171;
1172; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
1173; AVX2-FAST:       # %bb.0:
1174; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
1175; AVX2-FAST-NEXT:    retq
1176;
1177; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
1178; AVX512VL-SLOW:       # %bb.0:
1179; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15]
1180; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
1181; AVX512VL-SLOW-NEXT:    retq
1182;
1183; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
1184; AVX512VL-FAST:       # %bb.0:
1185; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
1186; AVX512VL-FAST-NEXT:    retq
1187  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
1188  ret <16 x i16> %shuffle
1189}
1190
1191define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
1192; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
1193; AVX1:       # %bb.0:
1194; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7]
1195; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
1196; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1197; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
1198; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
1199; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1200; AVX1-NEXT:    retq
1201;
1202; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
1203; AVX2-SLOW:       # %bb.0:
1204; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15]
1205; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
1206; AVX2-SLOW-NEXT:    retq
1207;
1208; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
1209; AVX2-FAST:       # %bb.0:
1210; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
1211; AVX2-FAST-NEXT:    retq
1212;
1213; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
1214; AVX512VL-SLOW:       # %bb.0:
1215; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15]
1216; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
1217; AVX512VL-SLOW-NEXT:    retq
1218;
1219; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
1220; AVX512VL-FAST:       # %bb.0:
1221; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
1222; AVX512VL-FAST-NEXT:    retq
1223  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
1224  ret <16 x i16> %shuffle
1225}
1226
1227define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1228; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
1229; AVX1:       # %bb.0:
1230; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7]
1231; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
1232; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1233; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
1234; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
1235; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1236; AVX1-NEXT:    retq
1237;
1238; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
1239; AVX2-SLOW:       # %bb.0:
1240; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15]
1241; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
1242; AVX2-SLOW-NEXT:    retq
1243;
1244; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
1245; AVX2-FAST:       # %bb.0:
1246; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
1247; AVX2-FAST-NEXT:    retq
1248;
1249; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
1250; AVX512VL-SLOW:       # %bb.0:
1251; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15]
1252; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
1253; AVX512VL-SLOW-NEXT:    retq
1254;
1255; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
1256; AVX512VL-FAST:       # %bb.0:
1257; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
1258; AVX512VL-FAST-NEXT:    retq
1259  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
1260  ret <16 x i16> %shuffle
1261}
1262
1263define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1264; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
1265; AVX1:       # %bb.0:
1266; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1267; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
1268; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1269; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1270; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1271; AVX1-NEXT:    retq
1272;
1273; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
1274; AVX2OR512VL:       # %bb.0:
1275; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
1276; AVX2OR512VL-NEXT:    retq
1277  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
1278  ret <16 x i16> %shuffle
1279}
1280
1281define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1282; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
1283; AVX1:       # %bb.0:
1284; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1285; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
1286; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1287; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1288; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1289; AVX1-NEXT:    retq
1290;
1291; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
1292; AVX2OR512VL:       # %bb.0:
1293; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
1294; AVX2OR512VL-NEXT:    retq
1295  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
1296  ret <16 x i16> %shuffle
1297}
1298
1299define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1300; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
1301; AVX1:       # %bb.0:
1302; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1303; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
1304; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1305; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1306; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1307; AVX1-NEXT:    retq
1308;
1309; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
1310; AVX2OR512VL:       # %bb.0:
1311; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
1312; AVX2OR512VL-NEXT:    retq
1313  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
1314  ret <16 x i16> %shuffle
1315}
1316
1317define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1318; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
1319; AVX1:       # %bb.0:
1320; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1321; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
1322; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1323; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1324; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1325; AVX1-NEXT:    retq
1326;
1327; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
1328; AVX2OR512VL:       # %bb.0:
1329; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
1330; AVX2OR512VL-NEXT:    retq
1331  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
1332  ret <16 x i16> %shuffle
1333}
1334
1335define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
1336; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
1337; AVX1:       # %bb.0:
1338; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1339; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1340; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1341; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1342; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1343; AVX1-NEXT:    retq
1344;
1345; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
1346; AVX2OR512VL:       # %bb.0:
1347; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
1348; AVX2OR512VL-NEXT:    retq
1349  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
1350  ret <16 x i16> %shuffle
1351}
1352
1353define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
1354; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
1355; AVX1:       # %bb.0:
1356; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1357; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1358; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1359; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1360; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1361; AVX1-NEXT:    retq
1362;
1363; AVX2OR512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
1364; AVX2OR512VL:       # %bb.0:
1365; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
1366; AVX2OR512VL-NEXT:    retq
1367  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
1368  ret <16 x i16> %shuffle
1369}
1370
1371define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
1372; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
1373; AVX1:       # %bb.0:
1374; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1375; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1376; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1377; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1378; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1379; AVX1-NEXT:    retq
1380;
1381; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
1382; AVX2:       # %bb.0:
1383; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
1384; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
1385; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1386; AVX2-NEXT:    retq
1387;
1388; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
1389; AVX512VL:       # %bb.0:
1390; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31]
1391; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1392; AVX512VL-NEXT:    retq
1393  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
1394  ret <16 x i16> %shuffle
1395}
1396
1397define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
1398; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
1399; AVX1:       # %bb.0:
1400; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1401; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1402; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1403; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1404; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1405; AVX1-NEXT:    retq
1406;
1407; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
1408; AVX2:       # %bb.0:
1409; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
1410; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
1411; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1412; AVX2-NEXT:    retq
1413;
1414; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
1415; AVX512VL:       # %bb.0:
1416; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27]
1417; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1418; AVX512VL-NEXT:    retq
1419  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
1420  ret <16 x i16> %shuffle
1421}
1422
1423define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1424; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
1425; AVX1:       # %bb.0:
1426; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7]
1427; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
1428; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1429; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,0,4,5,6,7]
1430; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1]
1431; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1432; AVX1-NEXT:    retq
1433;
1434; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
1435; AVX2OR512VL:       # %bb.0:
1436; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
1437; AVX2OR512VL-NEXT:    retq
1438  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
1439  ret <16 x i16> %shuffle
1440}
1441
1442define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1443; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
1444; AVX1:       # %bb.0:
1445; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7]
1446; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
1447; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1448; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,0,4,5,6,7]
1449; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
1450; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1451; AVX1-NEXT:    retq
1452;
1453; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
1454; AVX2OR512VL:       # %bb.0:
1455; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
1456; AVX2OR512VL-NEXT:    retq
1457  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
1458  ret <16 x i16> %shuffle
1459}
1460
1461define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1462; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
1463; AVX1:       # %bb.0:
1464; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7]
1465; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
1466; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1467; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
1468; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
1469; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1470; AVX1-NEXT:    retq
1471;
1472; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
1473; AVX2OR512VL:       # %bb.0:
1474; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
1475; AVX2OR512VL-NEXT:    retq
1476  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
1477  ret <16 x i16> %shuffle
1478}
1479
1480define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) {
1481; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
1482; AVX1:       # %bb.0:
1483; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
1484; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1485; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
1486; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1487; AVX1-NEXT:    retq
1488;
1489; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
1490; AVX2OR512VL:       # %bb.0:
1491; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
1492; AVX2OR512VL-NEXT:    retq
1493  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
1494  ret <16 x i16> %shuffle
1495}
1496
1497define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) {
1498; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
1499; AVX1:       # %bb.0:
1500; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
1501; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1502; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
1503; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1504; AVX1-NEXT:    retq
1505;
1506; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
1507; AVX2OR512VL:       # %bb.0:
1508; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
1509; AVX2OR512VL-NEXT:    retq
1510  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
1511  ret <16 x i16> %shuffle
1512}
1513
1514define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
1515; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
1516; AVX1:       # %bb.0:
1517; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
1518; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1519; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
1520; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1521; AVX1-NEXT:    retq
1522;
1523; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
1524; AVX2OR512VL:       # %bb.0:
1525; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
1526; AVX2OR512VL-NEXT:    retq
1527  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
1528  ret <16 x i16> %shuffle
1529}
1530
1531define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
1532; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
1533; AVX1:       # %bb.0:
1534; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
1535; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1536; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
1537; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1538; AVX1-NEXT:    retq
1539;
1540; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
1541; AVX2OR512VL:       # %bb.0:
1542; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
1543; AVX2OR512VL-NEXT:    retq
1544  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
1545  ret <16 x i16> %shuffle
1546}
1547
1548define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
1549; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
1550; AVX1:       # %bb.0:
1551; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
1552; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
1553; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1554; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
1555; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1556; AVX1-NEXT:    retq
1557;
1558; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
1559; AVX2OR512VL:       # %bb.0:
1560; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
1561; AVX2OR512VL-NEXT:    retq
1562  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
1563  ret <16 x i16> %shuffle
1564}
1565
1566define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
1567; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
1568; AVX1:       # %bb.0:
1569; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
1570; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1571; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1572; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1573; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1574; AVX1-NEXT:    retq
1575;
1576; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
1577; AVX2OR512VL:       # %bb.0:
1578; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
1579; AVX2OR512VL-NEXT:    retq
1580  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
1581  ret <16 x i16> %shuffle
1582}
1583
1584define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
1585; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
1586; AVX1:       # %bb.0:
1587; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
1588; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1589; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1590; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
1591; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1592; AVX1-NEXT:    retq
1593;
1594; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
1595; AVX2OR512VL:       # %bb.0:
1596; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
1597; AVX2OR512VL-NEXT:    retq
1598  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
1599  ret <16 x i16> %shuffle
1600}
1601
1602define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
1603; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
1604; AVX1:       # %bb.0:
1605; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
1606; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1607; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
1608; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1609; AVX1-NEXT:    retq
1610;
1611; AVX2OR512VL-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
1612; AVX2OR512VL:       # %bb.0:
1613; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
1614; AVX2OR512VL-NEXT:    retq
1615  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
1616  ret <16 x i16> %shuffle
1617}
1618
1619define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
1620; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
1621; AVX1:       # %bb.0:
1622; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7]
1623; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
1624; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1625; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
1626; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1627; AVX1-NEXT:    retq
1628;
1629; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
1630; AVX2OR512VL:       # %bb.0:
1631; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
1632; AVX2OR512VL-NEXT:    retq
1633  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
1634  ret <16 x i16> %shuffle
1635}
1636
1637define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
1638; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
1639; AVX1:       # %bb.0:
1640; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,7]
1641; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
1642; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1643; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
1644; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1645; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1646; AVX1-NEXT:    retq
1647;
1648; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
1649; AVX2OR512VL:       # %bb.0:
1650; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
1651; AVX2OR512VL-NEXT:    retq
1652  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
1653  ret <16 x i16> %shuffle
1654}
1655
1656define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
1657; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
1658; AVX1:       # %bb.0:
1659; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1660; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1661; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1662; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1663; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1664; AVX1-NEXT:    retq
1665;
1666; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
1667; AVX2-SLOW:       # %bb.0:
1668; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1669; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
1670; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
1671; AVX2-SLOW-NEXT:    retq
1672;
1673; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
1674; AVX2-FAST:       # %bb.0:
1675; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1676; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
1677; AVX2-FAST-NEXT:    retq
1678;
1679; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
1680; AVX512VL:       # %bb.0:
1681; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20]
1682; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1683; AVX512VL-NEXT:    retq
1684  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
1685  ret <16 x i16> %shuffle
1686}
1687
1688define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
1689; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
1690; AVX1:       # %bb.0:
1691; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1692; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1693; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1694; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1695; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1696; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1697; AVX1-NEXT:    retq
1698;
1699; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
1700; AVX2-SLOW:       # %bb.0:
1701; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
1702; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
1703; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
1704; AVX2-SLOW-NEXT:    retq
1705;
1706; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
1707; AVX2-FAST:       # %bb.0:
1708; AVX2-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
1709; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
1710; AVX2-FAST-NEXT:    retq
1711;
1712; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
1713; AVX512VL:       # %bb.0:
1714; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20]
1715; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1716; AVX512VL-NEXT:    retq
1717  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
1718  ret <16 x i16> %shuffle
1719}
1720
1721define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
1722; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
1723; AVX1:       # %bb.0:
1724; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1725; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1726; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1727; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1728; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1729; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1730; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1731; AVX1-NEXT:    retq
1732;
1733; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
1734; AVX2-SLOW:       # %bb.0:
1735; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1736; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
1737; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
1738; AVX2-SLOW-NEXT:    retq
1739;
1740; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
1741; AVX2-FAST:       # %bb.0:
1742; AVX2-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1743; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
1744; AVX2-FAST-NEXT:    retq
1745;
1746; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
1747; AVX512VL:       # %bb.0:
1748; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28]
1749; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1750; AVX512VL-NEXT:    retq
1751  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
1752  ret <16 x i16> %shuffle
1753}
1754
1755define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
1756; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
1757; AVX1:       # %bb.0:
1758; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1759; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1760; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1761; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1762; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1763; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1764; AVX1-NEXT:    retq
1765;
1766; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
1767; AVX2-SLOW:       # %bb.0:
1768; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1769; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
1770; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
1771; AVX2-SLOW-NEXT:    retq
1772;
1773; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
1774; AVX2-FAST:       # %bb.0:
1775; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1776; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
1777; AVX2-FAST-NEXT:    retq
1778;
1779; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
1780; AVX512VL:       # %bb.0:
1781; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28]
1782; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1783; AVX512VL-NEXT:    retq
1784  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
1785  ret <16 x i16> %shuffle
1786}
1787
1788define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i16> %a, <16 x i16> %b) {
1789; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
1790; AVX1:       # %bb.0:
1791; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1792; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1793; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1794; AVX1-NEXT:    retq
1795;
1796; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
1797; AVX2:       # %bb.0:
1798; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1799; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1800; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1801; AVX2-NEXT:    retq
1802;
1803; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
1804; AVX512VL:       # %bb.0:
1805; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
1806; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
1807; AVX512VL-NEXT:    retq
1808  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
1809  ret <16 x i16> %shuffle
1810}
1811
1812define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
1813; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
1814; AVX1:       # %bb.0:
1815; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1816; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1817; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1818; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1819; AVX1-NEXT:    retq
1820;
1821; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
1822; AVX2OR512VL:       # %bb.0:
1823; AVX2OR512VL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
1824; AVX2OR512VL-NEXT:    retq
1825  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
1826  ret <16 x i16> %shuffle
1827}
1828
1829define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
1830; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
1831; AVX1:       # %bb.0:
1832; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1833; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1834; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1835; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1836; AVX1-NEXT:    retq
1837;
1838; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
1839; AVX2OR512VL:       # %bb.0:
1840; AVX2OR512VL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
1841; AVX2OR512VL-NEXT:    retq
1842  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
1843  ret <16 x i16> %shuffle
1844}
1845
1846define <16 x i16> @shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13(<16 x i16> %a) {
1847; AVX1-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
1848; AVX1:       # %bb.0:
1849; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1850; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11]
1851; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1852; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1853; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1854; AVX1-NEXT:    retq
1855;
1856; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
1857; AVX2OR512VL:       # %bb.0:
1858; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11,28,29,30,31,18,19,20,21,30,31,16,17,24,25,26,27]
1859; AVX2OR512VL-NEXT:    retq
1860  %1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 6, i32 7, i32 1, i32 2, i32 7, i32 0, i32 4, i32 5, i32 14, i32 15, i32 9, i32 10, i32 15, i32 8, i32 12, i32 13>
1861  ret <16 x i16> %1
1862}
1863
1864;
1865; Shuffle to logical bit shifts
1866;
1867
1868define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) {
1869; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1870; AVX1:       # %bb.0:
1871; AVX1-NEXT:    vpslld $16, %xmm0, %xmm1
1872; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1873; AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
1874; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1875; AVX1-NEXT:    retq
1876;
1877; AVX2OR512VL-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1878; AVX2OR512VL:       # %bb.0:
1879; AVX2OR512VL-NEXT:    vpslld $16, %ymm0, %ymm0
1880; AVX2OR512VL-NEXT:    retq
1881  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
1882  ret <16 x i16> %shuffle
1883}
1884
1885define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) {
1886; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1887; AVX1:       # %bb.0:
1888; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm1
1889; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1890; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
1891; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1892; AVX1-NEXT:    retq
1893;
1894; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1895; AVX2OR512VL:       # %bb.0:
1896; AVX2OR512VL-NEXT:    vpsllq $48, %ymm0, %ymm0
1897; AVX2OR512VL-NEXT:    retq
1898  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
1899  ret <16 x i16> %shuffle
1900}
1901
1902define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) {
1903; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
1904; AVX1:       # %bb.0:
1905; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
1906; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1907; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
1908; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1909; AVX1-NEXT:    retq
1910;
1911; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
1912; AVX2OR512VL:       # %bb.0:
1913; AVX2OR512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
1914; AVX2OR512VL-NEXT:    retq
1915  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
1916  ret <16 x i16> %shuffle
1917}
1918
1919define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) {
1920; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
1921; AVX1:       # %bb.0:
1922; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1923; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
1924; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1925; AVX1-NEXT:    retq
1926;
1927; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
1928; AVX2OR512VL:       # %bb.0:
1929; AVX2OR512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
1930; AVX2OR512VL-NEXT:    retq
1931  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15, i32 16, i32 16>
1932  ret <16 x i16> %shuffle
1933}
1934
1935define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) {
1936; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
1937; AVX1:       # %bb.0:
1938; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1939; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1940; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1941; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1942; AVX1-NEXT:    retq
1943;
1944; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
1945; AVX2OR512VL:       # %bb.0:
1946; AVX2OR512VL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1947; AVX2OR512VL-NEXT:    retq
1948  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0>
1949  ret <16 x i16> %shuffle
1950}
1951
1952define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz(<16 x i16> %a) {
1953; AVX1-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz:
1954; AVX1:       # %bb.0:
1955; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1956; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1957; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1958; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1959; AVX1-NEXT:    retq
1960;
1961; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz:
1962; AVX2OR512VL:       # %bb.0:
1963; AVX2OR512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1964; AVX2OR512VL-NEXT:    retq
1965  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 17, i32 0, i32 18, i32 0, i32 19, i32 0, i32 20, i32 0, i32 21, i32 0, i32 22, i32 0, i32 23, i32 0>
1966  ret <16 x i16> %shuffle
1967}
1968
1969define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz(<16 x i16> %a) {
1970; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
1971; AVX1:       # %bb.0:
1972; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1973; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1974; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1975; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1976; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1977; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1978; AVX1-NEXT:    retq
1979;
1980; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
1981; AVX2:       # %bb.0:
1982; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1983; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1984; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1985; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1986; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1987; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1988; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1989; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
1990; AVX2-NEXT:    retq
1991;
1992; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
1993; AVX512VL:       # %bb.0:
1994; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15]
1995; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1996; AVX512VL-NEXT:    vpermt2w %ymm0, %ymm2, %ymm1
1997; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
1998; AVX512VL-NEXT:    retq
1999  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 28, i32 0, i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 30, i32 0, i32 0, i32 0, i32 31, i32 0, i32 0, i32 0>
2000  ret <16 x i16> %shuffle
2001}
2002
2003define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14(<16 x i16> %a, <16 x i16> %b) {
2004; AVX1-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
2005; AVX1:       # %bb.0:
2006; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2007; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2008; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2009; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2010; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2011; AVX1-NEXT:    retq
2012;
2013; AVX2OR512VL-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
2014; AVX2OR512VL:       # %bb.0:
2015; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
2016; AVX2OR512VL-NEXT:    retq
2017  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
2018  ret <16 x i16> %shuffle
2019}
2020
2021define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24(<16 x i16> %a, <16 x i16> %b) {
2022; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
2023; AVX1:       # %bb.0:
2024; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2025; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2026; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
2027; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
2028; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2029; AVX1-NEXT:    retq
2030;
2031; AVX2OR512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
2032; AVX2OR512VL:       # %bb.0:
2033; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
2034; AVX2OR512VL-NEXT:    retq
2035  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24>
2036  ret <16 x i16> %shuffle
2037}
2038
2039define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8(<16 x i16> %a, <16 x i16> %b) {
2040; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
2041; AVX1:       # %bb.0:
2042; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2043; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2044; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
2045; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
2046; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2047; AVX1-NEXT:    retq
2048;
2049; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
2050; AVX2OR512VL:       # %bb.0:
2051; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
2052; AVX2OR512VL-NEXT:    retq
2053  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 00, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8>
2054  ret <16 x i16> %shuffle
2055}
2056
2057define <16 x i16> @shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30(<16 x i16> %a, <16 x i16> %b) {
2058; AVX1-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
2059; AVX1:       # %bb.0:
2060; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2061; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2062; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2063; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2064; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2065; AVX1-NEXT:    retq
2066;
2067; AVX2OR512VL-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
2068; AVX2OR512VL:       # %bb.0:
2069; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm0[30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
2070; AVX2OR512VL-NEXT:    retq
2071  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
2072  ret <16 x i16> %shuffle
2073}
2074
2075define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16(<16 x i16> %a, <16 x i16> %b) {
2076; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
2077; AVX1:       # %bb.0:
2078; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
2079; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
2080; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2081; AVX1-NEXT:    retq
2082;
2083; AVX2-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
2084; AVX2:       # %bb.0:
2085; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2086; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,17]
2087; AVX2-NEXT:    retq
2088;
2089; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
2090; AVX512VL:       # %bb.0:
2091; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16]
2092; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
2093; AVX512VL-NEXT:    retq
2094  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16>
2095  ret <16 x i16> %shuffle
2096}
2097
2098define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22(<16 x i16> %a, <16 x i16> %b) {
2099; AVX1-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
2100; AVX1:       # %bb.0:
2101; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2102; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2103; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2104; AVX1-NEXT:    retq
2105;
2106; AVX2-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
2107; AVX2:       # %bb.0:
2108; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2109; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29]
2110; AVX2-NEXT:    retq
2111;
2112; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
2113; AVX512VL:       # %bb.0:
2114; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22]
2115; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
2116; AVX512VL-NEXT:    retq
2117  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
2118  ret <16 x i16> %shuffle
2119}
2120
2121define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11(<16 x i16> %a, <16 x i16> %b) {
2122; AVX1-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
2123; AVX1:       # %bb.0:
2124; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2125; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2126; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
2127; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
2128; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2129; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2130; AVX1-NEXT:    retq
2131;
2132; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
2133; AVX2:       # %bb.0:
2134; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2135; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
2136; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
2137; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2138; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2139; AVX2-NEXT:    retq
2140;
2141; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
2142; AVX512VL:       # %bb.0:
2143; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,0,1,2,3,2,11,8,9,8,9,10,11,10,11]
2144; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2145; AVX512VL-NEXT:    retq
2146  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11>
2147  ret <16 x i16> %shuffle
2148}
2149
2150define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09(<16 x i16> %a, <16 x i16> %b) {
2151; AVX1-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
2152; AVX1:       # %bb.0:
2153; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2154; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
2155; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
2156; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
2157; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2158; AVX1-NEXT:    retq
2159;
2160; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
2161; AVX2:       # %bb.0:
2162; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2163; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2164; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2165; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2166; AVX2-NEXT:    retq
2167;
2168; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
2169; AVX512VL:       # %bb.0:
2170; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,7,4,5,2,3,0,9,14,15,12,13,10,11,8,9]
2171; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2172; AVX512VL-NEXT:    retq
2173  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 9, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9>
2174  ret <16 x i16> %shuffle
2175}
2176
2177define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27(<16 x i16> %a, <16 x i16> %b) {
2178; AVX1-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
2179; AVX1:       # %bb.0:
2180; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2181; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2182; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
2183; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2184; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
2185; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2186; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2187; AVX1-NEXT:    retq
2188;
2189; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
2190; AVX2:       # %bb.0:
2191; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2192; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2193; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2194; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2195; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
2196; AVX2-NEXT:    retq
2197;
2198; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
2199; AVX512VL:       # %bb.0:
2200; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27]
2201; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
2202; AVX512VL-NEXT:    retq
2203  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 27, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27>
2204  ret <16 x i16> %shuffle
2205}
2206
2207define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
2208; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
2209; AVX1:       # %bb.0:
2210; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2211; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2212; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,1,4,5,6,7]
2213; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
2214; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
2215; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
2216; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2217; AVX1-NEXT:    retq
2218;
2219; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
2220; AVX2-SLOW:       # %bb.0:
2221; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2222; AVX2-SLOW-NEXT:    vpbroadcastw %xmm1, %xmm1
2223; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
2224; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
2225; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2226; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2227; AVX2-SLOW-NEXT:    retq
2228;
2229; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
2230; AVX2-FAST:       # %bb.0:
2231; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2232; AVX2-FAST-NEXT:    vpbroadcastw %xmm1, %xmm1
2233; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
2234; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2235; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2236; AVX2-FAST-NEXT:    retq
2237;
2238; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
2239; AVX512VL:       # %bb.0:
2240; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,8]
2241; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2242; AVX512VL-NEXT:    retq
2243  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
2244  ret <16 x i16> %shuffle
2245}
2246
2247define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
2248; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
2249; AVX1:       # %bb.0:
2250; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2251; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2252; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2253; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
2254; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2255; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
2256; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
2257; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2258; AVX1-NEXT:    retq
2259;
2260; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
2261; AVX2-SLOW:       # %bb.0:
2262; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2263; AVX2-SLOW-NEXT:    vpsllq $48, %xmm1, %xmm1
2264; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
2265; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
2266; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2267; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2268; AVX2-SLOW-NEXT:    retq
2269;
2270; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
2271; AVX2-FAST:       # %bb.0:
2272; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2273; AVX2-FAST-NEXT:    vpsllq $48, %xmm1, %xmm1
2274; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
2275; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2276; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2277; AVX2-FAST-NEXT:    retq
2278;
2279; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
2280; AVX512VL:       # %bb.0:
2281; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,4,4,4,12,8,8,8,8,12,12,12,12]
2282; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2283; AVX512VL-NEXT:    retq
2284  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
2285  ret <16 x i16> %shuffle
2286}
2287
2288define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11(<16 x i16> %a, <16 x i16> %b) {
2289; AVX1-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
2290; AVX1:       # %bb.0:
2291; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2292; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2293; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2294; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
2295; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
2296; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2297; AVX1-NEXT:    retq
2298;
2299; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
2300; AVX2:       # %bb.0:
2301; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2302; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
2303; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2304; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
2305; AVX2-NEXT:    retq
2306;
2307; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
2308; AVX512VL:       # %bb.0:
2309; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,0,u,1,u,2,u,11,u,8,u,9,u,10,u,11>
2310; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2311; AVX512VL-NEXT:    retq
2312  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11>
2313  ret <16 x i16> %shuffle
2314}
2315
2316define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15(<16 x i16> %a, <16 x i16> %b) {
2317; AVX1-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
2318; AVX1:       # %bb.0:
2319; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2320; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2321; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2322; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
2323; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
2324; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2325; AVX1-NEXT:    retq
2326;
2327; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
2328; AVX2:       # %bb.0:
2329; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
2330; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
2331; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
2332; AVX2-NEXT:    retq
2333;
2334; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
2335; AVX512VL:       # %bb.0:
2336; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,4,u,5,u,6,u,15,u,12,u,13,u,14,u,15>
2337; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2338; AVX512VL-NEXT:    retq
2339  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15>
2340  ret <16 x i16> %shuffle
2341}
2342
2343define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
2344; AVX1-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
2345; AVX1:       # %bb.0:
2346; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2347; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2348; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
2349; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
2350; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
2351; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2352; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2353; AVX1-NEXT:    retq
2354;
2355; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
2356; AVX2-SLOW:       # %bb.0:
2357; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2358; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2359; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2360; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
2361; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
2362; AVX2-SLOW-NEXT:    retq
2363;
2364; AVX2-FAST-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
2365; AVX2-FAST:       # %bb.0:
2366; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2367; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2368; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2369; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11,22,23,18,19,20,21,16,17,28,29,30,31,24,25,26,27]
2370; AVX2-FAST-NEXT:    retq
2371;
2372; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
2373; AVX512VL:       # %bb.0:
2374; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,7,4,13,11,9,10,8,14,15,12,13]
2375; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2376; AVX512VL-NEXT:    retq
2377  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 13, i32 11, i32 9, i32 10, i32 8, i32 14, i32 15, i32 12, i32 13>
2378  ret <16 x i16> %shuffle
2379}
2380
2381define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
2382; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
2383; AVX1:       # %bb.0:
2384; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2385; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
2386; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
2387; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2388; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
2389; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2390; AVX1-NEXT:    retq
2391;
2392; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
2393; AVX2:       # %bb.0:
2394; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2395; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
2396; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,24,25,24,25,24,25,24,25,16,17,16,17,16,17,16,17]
2397; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2398; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2399; AVX2-NEXT:    retq
2400;
2401; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
2402; AVX512VL:       # %bb.0:
2403; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,0,0,0,8,12,12,12,12,8,8,8,8]
2404; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2405; AVX512VL-NEXT:    retq
2406  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 8, i32 12, i32 12, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8>
2407  ret <16 x i16> %shuffle
2408}
2409
2410define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
2411; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
2412; AVX1:       # %bb.0:
2413; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2414; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2415; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2416; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
2417; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2418; AVX1-NEXT:    retq
2419;
2420; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
2421; AVX2:       # %bb.0:
2422; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2423; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2424; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2425; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
2426; AVX2-NEXT:    retq
2427;
2428; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
2429; AVX512VL:       # %bb.0:
2430; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,13,10,11,8,9,14,15,12,13]
2431; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2432; AVX512VL-NEXT:    retq
2433  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
2434  ret <16 x i16> %shuffle
2435}
2436
2437define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
2438; AVX1-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
2439; AVX1:       # %bb.0:
2440; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2441; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2442; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
2443; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
2444; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
2445; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2446; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2447; AVX1-NEXT:    retq
2448;
2449; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
2450; AVX2-SLOW:       # %bb.0:
2451; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2452; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
2453; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2454; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15]
2455; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
2456; AVX2-SLOW-NEXT:    retq
2457;
2458; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
2459; AVX2-FAST:       # %bb.0:
2460; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2461; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
2462; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2463; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11,20,21,22,23,16,17,20,21,28,29,30,31,24,25,26,27]
2464; AVX2-FAST-NEXT:    retq
2465;
2466; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
2467; AVX512VL:       # %bb.0:
2468; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,2,6,7,4,13,10,11,8,10,14,15,12,13]
2469; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2470; AVX512VL-NEXT:    retq
2471  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 10, i32 14, i32 15, i32 12, i32 13>
2472  ret <16 x i16> %shuffle
2473}
2474
2475define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15(<16 x i16> %a, <16 x i16> %b) {
2476; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
2477; AVX1:       # %bb.0:
2478; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2479; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2480; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2481; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
2482; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
2483; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2484; AVX1-NEXT:    retq
2485;
2486; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
2487; AVX2-SLOW:       # %bb.0:
2488; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2489; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
2490; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
2491; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2492; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2493; AVX2-SLOW-NEXT:    retq
2494;
2495; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
2496; AVX2-FAST:       # %bb.0:
2497; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31]
2498; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
2499; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2500; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
2501; AVX2-FAST-NEXT:    retq
2502;
2503; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
2504; AVX512VL:       # %bb.0:
2505; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,15,10,11,8,9,14,15,12,15]
2506; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2507; AVX512VL-NEXT:    retq
2508  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 15, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 15>
2509  ret <16 x i16> %shuffle
2510}
2511
2512define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08(<16 x i16> %a, <16 x i16> %b) {
2513; AVX1-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
2514; AVX1:       # %bb.0:
2515; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2516; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
2517; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
2518; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2519; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2520; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
2521; AVX1-NEXT:    retq
2522;
2523; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
2524; AVX2:       # %bb.0:
2525; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2526; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2527; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2528; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1,30,31,26,27,28,29,24,25,22,23,18,19,20,21,16,17]
2529; AVX2-NEXT:    retq
2530;
2531; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
2532; AVX512VL:       # %bb.0:
2533; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,5,6,4,3,1,2,8,15,13,14,12,11,9,10,8]
2534; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2535; AVX512VL-NEXT:    retq
2536  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 8, i32 15, i32 13, i32 14, i32 12, i32 11, i32 9, i32 10, i32 8>
2537  ret <16 x i16> %shuffle
2538}
2539
2540define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08(<16 x i16> %a, <16 x i16> %b) {
2541; AVX1-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
2542; AVX1:       # %bb.0:
2543; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2544; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
2545; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
2546; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2547; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
2548; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2549; AVX1-NEXT:    retq
2550;
2551; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
2552; AVX2:       # %bb.0:
2553; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2554; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
2555; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1,18,19,16,17,26,27,24,25,26,27,24,25,18,19,16,17]
2556; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2557; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2558; AVX2-NEXT:    retq
2559;
2560; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
2561; AVX512VL:       # %bb.0:
2562; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,5,4,5,4,1,8,9,8,13,12,13,12,9,8]
2563; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2564; AVX512VL-NEXT:    retq
2565  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 8, i32 9, i32 8, i32 13, i32 12, i32 13, i32 12, i32 9, i32 8>
2566  ret <16 x i16> %shuffle
2567}
2568
2569define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08(<16 x i16> %a, <16 x i16> %b) {
2570; AVX1-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
2571; AVX1:       # %bb.0:
2572; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2573; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
2574; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
2575; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2576; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
2577; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2578; AVX1-NEXT:    retq
2579;
2580; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
2581; AVX2:       # %bb.0:
2582; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2583; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
2584; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1,26,27,24,25,18,19,16,17,26,27,24,25,18,19,16,17]
2585; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2586; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2587; AVX2-NEXT:    retq
2588;
2589; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
2590; AVX512VL:       # %bb.0:
2591; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,4,1,0,5,4,1,8,13,12,9,8,13,12,9,8]
2592; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2593; AVX512VL-NEXT:    retq
2594  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 8, i32 13, i32 12, i32 9, i32 8, i32 13, i32 12, i32 9, i32 8>
2595  ret <16 x i16> %shuffle
2596}
2597
2598define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12(<16 x i16> %a, <16 x i16> %b) {
2599; AVX1-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
2600; AVX1:       # %bb.0:
2601; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2602; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2603; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
2604; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2605; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
2606; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2607; AVX1-NEXT:    retq
2608;
2609; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
2610; AVX2:       # %bb.0:
2611; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2612; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
2613; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9,26,27,24,25,18,19,16,17,18,19,16,17,26,27,24,25]
2614; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2615; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2616; AVX2-NEXT:    retq
2617;
2618; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
2619; AVX512VL:       # %bb.0:
2620; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,4,1,0,1,0,5,12,13,12,9,8,9,8,13,12]
2621; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2622; AVX512VL-NEXT:    retq
2623  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 12, i32 13, i32 12, i32 9, i32 8, i32 9, i32 8, i32 13, i32 12>
2624  ret <16 x i16> %shuffle
2625}
2626
2627define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08(<16 x i16> %a, <16 x i16> %b) {
2628; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
2629; AVX1:       # %bb.0:
2630; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2631; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
2632; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
2633; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2634; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
2635; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2636; AVX1-NEXT:    retq
2637;
2638; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
2639; AVX2:       # %bb.0:
2640; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2641; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
2642; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1,16,17,24,25,24,25,16,17,16,17,24,25,24,25,16,17]
2643; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2644; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2645; AVX2-NEXT:    retq
2646;
2647; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
2648; AVX512VL:       # %bb.0:
2649; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,4,4,8,8,12,12,8,8,12,12,8]
2650; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2651; AVX512VL-NEXT:    retq
2652  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8>
2653  ret <16 x i16> %shuffle
2654}
2655
2656define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12(<16 x i16> %a, <16 x i16> %b) {
2657; AVX1-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
2658; AVX1:       # %bb.0:
2659; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2660; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2661; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
2662; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2663; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
2664; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2665; AVX1-NEXT:    retq
2666;
2667; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
2668; AVX2:       # %bb.0:
2669; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2670; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
2671; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9,24,25,16,17,16,17,24,25,24,25,16,17,16,17,24,25]
2672; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2673; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2674; AVX2-NEXT:    retq
2675;
2676; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
2677; AVX512VL:       # %bb.0:
2678; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,0,0,4,4,0,0,12,12,8,8,12,12,8,8,12]
2679; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2680; AVX512VL-NEXT:    retq
2681  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12>
2682  ret <16 x i16> %shuffle
2683}
2684
2685define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11(<16 x i16> %a, <16 x i16> %b) {
2686; AVX1-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
2687; AVX1:       # %bb.0:
2688; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2689; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
2690; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
2691; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2692; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2693; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
2694; AVX1-NEXT:    retq
2695;
2696; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
2697; AVX2:       # %bb.0:
2698; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2699; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2700; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2701; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7,20,21,28,29,24,25,16,17,26,27,18,19,30,31,22,23]
2702; AVX2-NEXT:    retq
2703;
2704; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
2705; AVX512VL:       # %bb.0:
2706; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,6,4,0,5,1,7,11,10,14,12,8,13,9,15,11]
2707; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2708; AVX512VL-NEXT:    retq
2709  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 11, i32 10, i32 14, i32 12, i32 8, i32 13, i32 9, i32 15, i32 11>
2710  ret <16 x i16> %shuffle
2711}
2712
2713define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11(<16 x i16> %a, <16 x i16> %b) {
2714; AVX1-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
2715; AVX1:       # %bb.0:
2716; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2717; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
2718; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
2719; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2720; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2721; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
2722; AVX1-NEXT:    retq
2723;
2724; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
2725; AVX2:       # %bb.0:
2726; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2727; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2728; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2729; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7,20,21,16,17,28,29,24,25,26,27,18,19,30,31,22,23]
2730; AVX2-NEXT:    retq
2731;
2732; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
2733; AVX512VL:       # %bb.0:
2734; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,0,6,4,5,1,7,11,10,8,14,12,13,9,15,11]
2735; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2736; AVX512VL-NEXT:    retq
2737  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 11, i32 10, i32 8, i32 14, i32 12, i32 13, i32 9, i32 15, i32 11>
2738  ret <16 x i16> %shuffle
2739}
2740
2741define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13(<16 x i16> %a, <16 x i16> %b) {
2742; AVX1-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
2743; AVX1:       # %bb.0:
2744; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2745; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
2746; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
2747; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2748; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2749; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
2750; AVX1-NEXT:    retq
2751;
2752; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
2753; AVX2:       # %bb.0:
2754; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2755; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2756; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2757; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11,20,21,28,29,24,25,16,17,18,19,22,23,30,31,26,27]
2758; AVX2-NEXT:    retq
2759;
2760; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
2761; AVX512VL:       # %bb.0:
2762; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,6,4,0,1,3,7,13,10,14,12,8,9,11,15,13]
2763; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2764; AVX512VL-NEXT:    retq
2765  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 13, i32 10, i32 14, i32 12, i32 8, i32 9, i32 11, i32 15, i32 13>
2766  ret <16 x i16> %shuffle
2767}
2768
2769define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11(<16 x i16> %a, <16 x i16> %b) {
2770; AVX1-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
2771; AVX1:       # %bb.0:
2772; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2773; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
2774; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
2775; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
2776; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2777; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
2778; AVX1-NEXT:    retq
2779;
2780; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
2781; AVX2-SLOW:       # %bb.0:
2782; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2783; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
2784; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
2785; AVX2-SLOW-NEXT:    retq
2786;
2787; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
2788; AVX2-FAST:       # %bb.0:
2789; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
2790; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2791; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
2792; AVX2-FAST-NEXT:    retq
2793;
2794; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
2795; AVX512VL:       # %bb.0:
2796; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,7,5,1,6,4,11,14,14,15,13,9,14,12,11]
2797; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2798; AVX512VL-NEXT:    retq
2799  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 11, i32 14, i32 14, i32 15, i32 13, i32 9, i32 14, i32 12, i32 11>
2800  ret <16 x i16> %shuffle
2801}
2802
2803define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
2804; AVX1-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
2805; AVX1:       # %bb.0:
2806; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2807; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2808; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
2809; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2810; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
2811; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2812; AVX1-NEXT:    retq
2813;
2814; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
2815; AVX2:       # %bb.0:
2816; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2817; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
2818; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9,16,17,16,17,24,25,24,25,24,25,24,25,24,25,24,25]
2819; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2820; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2821; AVX2-NEXT:    retq
2822;
2823; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
2824; AVX512VL:       # %bb.0:
2825; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,8,12,12,12,12,12,12]
2826; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2827; AVX512VL-NEXT:    retq
2828  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
2829  ret <16 x i16> %shuffle
2830}
2831
2832define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
2833; AVX1-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
2834; AVX1:       # %bb.0:
2835; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2836; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2837; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
2838; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2839; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
2840; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2841; AVX1-NEXT:    retq
2842;
2843; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
2844; AVX2:       # %bb.0:
2845; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2846; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
2847; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9,24,25,24,25,16,17,16,17,24,25,24,25,24,25,24,25]
2848; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2849; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2850; AVX2-NEXT:    retq
2851;
2852; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
2853; AVX512VL:       # %bb.0:
2854; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,8,8,12,12,12,12]
2855; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2856; AVX512VL-NEXT:    retq
2857  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
2858  ret <16 x i16> %shuffle
2859}
2860
2861define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
2862; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
2863; AVX1:       # %bb.0:
2864; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2865; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2866; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2867; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2868; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2869; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2870; AVX1-NEXT:    retq
2871;
2872; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
2873; AVX2:       # %bb.0:
2874; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2875; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
2876; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
2877; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2878; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2879; AVX2-NEXT:    retq
2880;
2881; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
2882; AVX512VL:       # %bb.0:
2883; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,8,12,12,8,12,12,12,12]
2884; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2885; AVX512VL-NEXT:    retq
2886  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
2887  ret <16 x i16> %shuffle
2888}
2889
2890define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
2891; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
2892; AVX1:       # %bb.0:
2893; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2894; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
2895; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
2896; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2897; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
2898; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2899; AVX1-NEXT:    retq
2900;
2901; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
2902; AVX2:       # %bb.0:
2903; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2904; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
2905; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1,16,17,24,25,24,25,16,17,16,17,16,17,16,17,16,17]
2906; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2907; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2908; AVX2-NEXT:    retq
2909;
2910; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
2911; AVX512VL:       # %bb.0:
2912; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,0,0,8,8,12,12,8,8,8,8,8]
2913; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2914; AVX512VL-NEXT:    retq
2915  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8, i32 8>
2916  ret <16 x i16> %shuffle
2917}
2918
2919define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
2920; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
2921; AVX1:       # %bb.0:
2922; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2923; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2924; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
2925; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2926; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2927; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
2928; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2929; AVX1-NEXT:    retq
2930;
2931; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
2932; AVX2-SLOW:       # %bb.0:
2933; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2934; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2935; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15]
2936; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2937; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2938; AVX2-SLOW-NEXT:    retq
2939;
2940; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
2941; AVX2-FAST:       # %bb.0:
2942; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31]
2943; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
2944; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2945; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
2946; AVX2-FAST-NEXT:    retq
2947;
2948; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
2949; AVX512VL:       # %bb.0:
2950; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,5,6,15,8,12,12,8,12,13,14,15]
2951; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2952; AVX512VL-NEXT:    retq
2953  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 15, i32 8, i32 12, i32 12, i32 8, i32 12, i32 13, i32 14, i32 15>
2954  ret <16 x i16> %shuffle
2955}
2956
2957define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
2958; AVX1-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
2959; AVX1:       # %bb.0:
2960; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2961; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2962; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
2963; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2964; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
2965; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2966; AVX1-NEXT:    retq
2967;
2968; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
2969; AVX2:       # %bb.0:
2970; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2971; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
2972; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9,16,17,18,19,24,25,24,25,24,25,24,25,24,25,24,25]
2973; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,255,255,255,255,0,0,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
2974; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2975; AVX2-NEXT:    retq
2976;
2977; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
2978; AVX512VL:       # %bb.0:
2979; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,u,4,4,4,4,4,12,8,u,12,12,12,12,12,12>
2980; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
2981; AVX512VL-NEXT:    retq
2982  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
2983  ret <16 x i16> %shuffle
2984}
2985
2986define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
2987; AVX1-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
2988; AVX1:       # %bb.0:
2989; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2990; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
2991; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2992; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2993; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2994; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2995; AVX1-NEXT:    retq
2996;
2997; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
2998; AVX2:       # %bb.0:
2999; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3000; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
3001; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9,24,25,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
3002; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,255,255,255,255,255,255,255,255,0,0,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255>
3003; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3004; AVX2-NEXT:    retq
3005;
3006; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
3007; AVX512VL:       # %bb.0:
3008; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = <4,4,u,0,4,4,4,12,12,12,u,8,12,12,12,12>
3009; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
3010; AVX512VL-NEXT:    retq
3011  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 undef, i32 8, i32 12, i32 12, i32 12, i32 12>
3012  ret <16 x i16> %shuffle
3013}
3014
3015define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
3016; AVX1-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
3017; AVX1:       # %bb.0:
3018; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3019; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm2
3020; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
3021; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
3022; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
3023; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3024; AVX1-NEXT:    retq
3025;
3026; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
3027; AVX2:       # %bb.0:
3028; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3029; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
3030; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
3031; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,255,255,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255>
3032; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3033; AVX2-NEXT:    retq
3034;
3035; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
3036; AVX512VL:       # %bb.0:
3037; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,4,4,0,4,4,4,12,u,12,12,8,12,12,12,12>
3038; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
3039; AVX512VL-NEXT:    retq
3040  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 undef, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
3041  ret <16 x i16> %shuffle
3042}
3043
3044define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
3045; AVX1-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
3046; AVX1:       # %bb.0:
3047; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3048; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
3049; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3050; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3051; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3052; AVX1-NEXT:    retq
3053;
3054; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
3055; AVX2OR512VL:       # %bb.0:
3056; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31]
3057; AVX2OR512VL-NEXT:    retq
3058  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
3059  ret <16 x i16> %shuffle
3060}
3061
3062define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
3063; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
3064; AVX1:       # %bb.0:
3065; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3066; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
3067; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
3068; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
3069; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3070; AVX1-NEXT:    retq
3071;
3072; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
3073; AVX2:       # %bb.0:
3074; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
3075; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7,28,29,22,23,20,21,22,23,24,25,26,27,28,29,22,23]
3076; AVX2-NEXT:    retq
3077;
3078; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
3079; AVX512VL:       # %bb.0:
3080; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,4,5,6,11,u,u,u,u,12,13,14,11>
3081; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
3082; AVX512VL-NEXT:    retq
3083  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11>
3084  ret <16 x i16> %shuffle
3085}
3086
3087define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
3088; AVX1-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
3089; AVX1:       # %bb.0:
3090; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3091; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
3092; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3093; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3094; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3095; AVX1-NEXT:    retq
3096;
3097; AVX2OR512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
3098; AVX2OR512VL:       # %bb.0:
3099; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19]
3100; AVX2OR512VL-NEXT:    retq
3101  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
3102  ret <16 x i16> %shuffle
3103}
3104
3105define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
3106; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
3107; AVX1:       # %bb.0:
3108; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
3109; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3110; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
3111; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3112; AVX1-NEXT:    retq
3113;
3114; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
3115; AVX2OR512VL:       # %bb.0:
3116; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
3117; AVX2OR512VL-NEXT:    retq
3118  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3119  ret <16 x i16> %shuffle
3120}
3121
3122define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
3123; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
3124; AVX1:       # %bb.0:
3125; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3126; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
3127; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
3128; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
3129; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3130; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3131; AVX1-NEXT:    retq
3132;
3133; AVX2-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
3134; AVX2:       # %bb.0:
3135; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3136; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3137; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3138; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7,16,17,18,19,20,21,30,31,24,25,26,27,28,29,22,23]
3139; AVX2-NEXT:    retq
3140;
3141; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
3142; AVX512VL:       # %bb.0:
3143; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,7,4,5,6,11,8,9,10,15,12,13,14,11]
3144; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
3145; AVX512VL-NEXT:    retq
3146  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14, i32 11>
3147  ret <16 x i16> %shuffle
3148}
3149
3150define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15(<16 x i16> %a, <16 x i16> %b) {
3151; AVX1-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
3152; AVX1:       # %bb.0:
3153; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3154; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
3155; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
3156; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
3157; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3158; AVX1-NEXT:    retq
3159;
3160; AVX2-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
3161; AVX2:       # %bb.0:
3162; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15,24,25,26,27,28,29,22,23,16,17,18,19,20,21,30,31]
3163; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
3164; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3165; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
3166; AVX2-NEXT:    retq
3167;
3168; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
3169; AVX512VL:       # %bb.0:
3170; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,3,0,1,2,15,12,13,14,11,8,9,10,15]
3171; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
3172; AVX512VL-NEXT:    retq
3173  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 15, i32 12, i32 13, i32 14, i32 11, i32 8, i32 9, i32 10, i32 15>
3174  ret <16 x i16> %shuffle
3175}
3176
3177define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13(<16 x i16> %a, <16 x i16> %b) {
3178; AVX1-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
3179; AVX1:       # %bb.0:
3180; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3181; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
3182; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
3183; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
3184; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3185; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3186; AVX1-NEXT:    retq
3187;
3188; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
3189; AVX2-SLOW:       # %bb.0:
3190; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3191; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
3192; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
3193; AVX2-SLOW-NEXT:    retq
3194;
3195; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
3196; AVX2-FAST:       # %bb.0:
3197; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
3198; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
3199; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
3200; AVX2-FAST-NEXT:    retq
3201;
3202; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
3203; AVX512VL:       # %bb.0:
3204; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,1,0,2,7,3,13,11,15,9,8,10,15,11,13]
3205; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
3206; AVX512VL-NEXT:    retq
3207  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13>
3208  ret <16 x i16> %shuffle
3209}
3210
3211define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
3212; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
3213; AVX1:       # %bb.0:
3214; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3215; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3216; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3217; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3218; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15]
3219; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3220; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3221; AVX1-NEXT:    retq
3222;
3223; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
3224; AVX2:       # %bb.0:
3225; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3226; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
3227; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3228; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
3229; AVX2-NEXT:    retq
3230;
3231; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
3232; AVX512VL:       # %bb.0:
3233; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
3234; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3235; AVX512VL-NEXT:    retq
3236  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 27, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
3237  ret <16 x i16> %shuffle
3238}
3239
3240define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31(<16 x i16> %a, <16 x i16> %b) {
3241; AVX1-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
3242; AVX1:       # %bb.0:
3243; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3244; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3245; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
3246; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
3247; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3248; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15]
3249; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3250; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3251; AVX1-NEXT:    retq
3252;
3253; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
3254; AVX2:       # %bb.0:
3255; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3256; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
3257; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
3258; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3259; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
3260; AVX2-NEXT:    retq
3261;
3262; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
3263; AVX512VL:       # %bb.0:
3264; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31]
3265; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3266; AVX512VL-NEXT:    retq
3267  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 20, i32 1, i32 21, i32 2, i32 22, i32 3, i32 31, i32 8, i32 28, i32 9, i32 29, i32 10, i32 30, i32 11, i32 31>
3268  ret <16 x i16> %shuffle
3269}
3270
3271define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
3272; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
3273; AVX1:       # %bb.0:
3274; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3275; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3276; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3277; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3278; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15]
3279; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3280; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3281; AVX1-NEXT:    retq
3282;
3283; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
3284; AVX2:       # %bb.0:
3285; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
3286; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
3287; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
3288; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
3289; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3290; AVX2-NEXT:    retq
3291;
3292; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
3293; AVX512VL:       # %bb.0:
3294; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
3295; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3296; AVX512VL-NEXT:    retq
3297  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 31, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
3298  ret <16 x i16> %shuffle
3299}
3300
3301define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27(<16 x i16> %a, <16 x i16> %b) {
3302; AVX1-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
3303; AVX1:       # %bb.0:
3304; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3305; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3306; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
3307; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3308; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3309; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15]
3310; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3311; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3312; AVX1-NEXT:    retq
3313;
3314; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
3315; AVX2:       # %bb.0:
3316; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3317; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3318; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3319; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3320; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
3321; AVX2-NEXT:    retq
3322;
3323; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
3324; AVX512VL:       # %bb.0:
3325; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27]
3326; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3327; AVX512VL-NEXT:    retq
3328  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 16, i32 5, i32 17, i32 6, i32 18, i32 7, i32 27, i32 12, i32 24, i32 13, i32 25, i32 14, i32 26, i32 15, i32 27>
3329  ret <16 x i16> %shuffle
3330}
3331
3332define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
3333; AVX1-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
3334; AVX1:       # %bb.0:
3335; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3336; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
3337; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
3338; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
3339; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
3340; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
3341; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3342; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
3343; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
3344; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3345; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3346; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3347; AVX1-NEXT:    retq
3348;
3349; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
3350; AVX2-SLOW:       # %bb.0:
3351; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3352; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
3353; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
3354; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
3355; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3356; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
3357; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3358; AVX2-SLOW-NEXT:    retq
3359;
3360; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
3361; AVX2-FAST:       # %bb.0:
3362; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3363; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,12,13,14,15,16,17,16,17,20,21,18,19,24,25,28,29,28,29,30,31]
3364; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
3365; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
3366; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3367; AVX2-FAST-NEXT:    retq
3368;
3369; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
3370; AVX512VL:       # %bb.0:
3371; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31]
3372; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3373; AVX512VL-NEXT:    retq
3374  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 6, i32 22, i32 7, i32 31, i32 8, i32 24, i32 9, i32 25, i32 14, i32 30, i32 15, i32 31>
3375  ret <16 x i16> %shuffle
3376}
3377
3378define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25(<16 x i16> %a, <16 x i16> %b) {
3379; AVX1-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
3380; AVX1:       # %bb.0:
3381; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3382; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,0,2,3]
3383; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
3384; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
3385; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
3386; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
3387; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,0,1,2,3,2,3,0,1,12,13,2,3]
3388; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3389; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3390; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3391; AVX1-NEXT:    retq
3392;
3393; AVX2-SLOW-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
3394; AVX2-SLOW:       # %bb.0:
3395; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3396; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
3397; AVX2-SLOW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3398; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
3399; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3400; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
3401; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3402; AVX2-SLOW-NEXT:    retq
3403;
3404; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
3405; AVX2-FAST:       # %bb.0:
3406; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3407; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
3408; AVX2-FAST-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3409; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
3410; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
3411; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3412; AVX2-FAST-NEXT:    retq
3413;
3414; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
3415; AVX512VL:       # %bb.0:
3416; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25]
3417; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3418; AVX512VL-NEXT:    retq
3419  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 20, i32 1, i32 21, i32 6, i32 16, i32 7, i32 25, i32 8, i32 28, i32 9, i32 29, i32 14, i32 24, i32 15, i32 25>
3420  ret <16 x i16> %shuffle
3421}
3422
3423define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26(<16 x i16> %a, <16 x i16> %b) {
3424; AVX1-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
3425; AVX1:       # %bb.0:
3426; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3427; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3428; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,12,13,10,11,8,9,10,11,12,13,10,11]
3429; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
3430; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3431; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3432; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3433; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
3434; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
3435; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3436; AVX1-NEXT:    retq
3437;
3438; AVX2-SLOW-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
3439; AVX2-SLOW:       # %bb.0:
3440; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3441; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
3442; AVX2-SLOW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3443; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
3444; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
3445; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3446; AVX2-SLOW-NEXT:    retq
3447;
3448; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
3449; AVX2-FAST:       # %bb.0:
3450; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3451; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
3452; AVX2-FAST-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3453; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,2,3,0,1,8,9,10,11,6,7,4,5,18,19,16,17,18,19,16,17,24,25,26,27,22,23,20,21]
3454; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,12,13,14,15,18,19,16,17,22,23,20,21,22,23,20,21,28,29,30,31]
3455; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
3456; AVX2-FAST-NEXT:    retq
3457;
3458; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
3459; AVX512VL:       # %bb.0:
3460; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26]
3461; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3462; AVX512VL-NEXT:    retq
3463  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 17, i32 16, i32 3, i32 2, i32 19, i32 26, i32 9, i32 8, i32 25, i32 24, i32 11, i32 10, i32 27, i32 26>
3464  ret <16 x i16> %shuffle
3465}
3466
3467define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11(<16 x i16> %a, <16 x i16> %b) {
3468; AVX1-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
3469; AVX1:       # %bb.0:
3470; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3471; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3472; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3473; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3474; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15]
3475; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3476; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3477; AVX1-NEXT:    retq
3478;
3479; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
3480; AVX2:       # %bb.0:
3481; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
3482; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
3483; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
3484; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
3485; AVX2-NEXT:    retq
3486;
3487; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
3488; AVX512VL:       # %bb.0:
3489; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
3490; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3491; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
3492; AVX512VL-NEXT:    retq
3493  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 11, i32 24, i32 8, i32 25, i32 9, i32 26, i32 10, i32 27, i32 11>
3494  ret <16 x i16> %shuffle
3495}
3496
3497define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15(<16 x i16> %a, <16 x i16> %b) {
3498; AVX1-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
3499; AVX1:       # %bb.0:
3500; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3501; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3502; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3503; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3504; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15]
3505; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3506; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3507; AVX1-NEXT:    retq
3508;
3509; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
3510; AVX2:       # %bb.0:
3511; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
3512; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
3513; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
3514; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
3515; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
3516; AVX2-NEXT:    retq
3517;
3518; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
3519; AVX512VL:       # %bb.0:
3520; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
3521; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3522; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
3523; AVX512VL-NEXT:    retq
3524  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 15, i32 28, i32 12, i32 29, i32 13, i32 30, i32 14, i32 31, i32 15>
3525  ret <16 x i16> %shuffle
3526}
3527
3528define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31(<16 x i16> %a, <16 x i16> %b) {
3529; AVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
3530; AVX1:       # %bb.0:
3531; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3532; AVX1-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,5,7]
3533; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
3534; AVX1-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
3535; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
3536; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3537; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,4,5,14,15,0,1,4,5,4,5,6,7]
3538; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
3539; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3540; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3541; AVX1-NEXT:    retq
3542;
3543; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
3544; AVX2-SLOW:       # %bb.0:
3545; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3546; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3547; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
3548; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
3549; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3550; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3551; AVX2-SLOW-NEXT:    retq
3552;
3553; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
3554; AVX2-FAST:       # %bb.0:
3555; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3556; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31]
3557; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
3558; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3559; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
3560; AVX2-FAST-NEXT:    retq
3561;
3562; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
3563; AVX512VL:       # %bb.0:
3564; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31]
3565; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3566; AVX512VL-NEXT:    retq
3567  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 1, i32 3, i32 20, i32 22, i32 21, i32 31, i32 8, i32 10, i32 9, i32 11, i32 28, i32 30, i32 29, i32 31>
3568  ret <16 x i16> %shuffle
3569}
3570
3571define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
3572; AVX1-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
3573; AVX1:       # %bb.0:
3574; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
3575; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
3576; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
3577; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
3578; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3579; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
3580; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
3581; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
3582; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
3583; AVX1-NEXT:    retq
3584;
3585; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
3586; AVX2-SLOW:       # %bb.0:
3587; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
3588; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7]
3589; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15]
3590; AVX2-SLOW-NEXT:    retq
3591;
3592; AVX2-FAST-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
3593; AVX2-FAST:       # %bb.0:
3594; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
3595; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15,24,25,24,25,22,23,20,21,24,25,26,27,28,29,30,31]
3596; AVX2-FAST-NEXT:    retq
3597;
3598; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
3599; AVX512VL:       # %bb.0:
3600; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u>
3601; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3602; AVX512VL-NEXT:    retq
3603  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 3, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 12, i32 11, i32 26, i32 undef, i32 undef, i32 undef, i32 undef>
3604  ret <16 x i16> %shuffle
3605}
3606
3607define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
3608; AVX1-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
3609; AVX1:       # %bb.0:
3610; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3611; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3612; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
3613; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
3614; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3615; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3616; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3617; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3618; AVX1-NEXT:    retq
3619;
3620; AVX2-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
3621; AVX2:       # %bb.0:
3622; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3623; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3,16,17,22,23,20,21,26,27,16,17,26,27,16,17,18,19]
3624; AVX2-NEXT:    retq
3625;
3626; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
3627; AVX512VL:       # %bb.0:
3628; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u>
3629; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3630; AVX512VL-NEXT:    retq
3631  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 3, i32 2, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 11, i32 10, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
3632  ret <16 x i16> %shuffle
3633}
3634
3635define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
3636; ALL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu:
3637; ALL:       # %bb.0:
3638; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7]
3639; ALL-NEXT:    retq
3640  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
3641  ret <16 x i16> %shuffle
3642}
3643
3644define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
3645; AVX1-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
3646; AVX1:       # %bb.0:
3647; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3648; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3649; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
3650; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7]
3651; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
3652; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
3653; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3654; AVX1-NEXT:    retq
3655;
3656; AVX2-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
3657; AVX2:       # %bb.0:
3658; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3659; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
3660; AVX2-NEXT:    retq
3661;
3662; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
3663; AVX512VL:       # %bb.0:
3664; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u>
3665; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3666; AVX512VL-NEXT:    retq
3667  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
3668  ret <16 x i16> %shuffle
3669}
3670
3671define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11(<16 x i16> %a, <16 x i16> %b) {
3672; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
3673; AVX1:       # %bb.0:
3674; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3675; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3676; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3677; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7]
3678; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
3679; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3680; AVX1-NEXT:    retq
3681;
3682; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
3683; AVX2:       # %bb.0:
3684; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,2]
3685; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
3686; AVX2-NEXT:    retq
3687;
3688; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
3689; AVX512VL:       # %bb.0:
3690; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27>
3691; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3692; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
3693; AVX512VL-NEXT:    retq
3694  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 11>
3695  ret <16 x i16> %shuffle
3696}
3697
3698define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
3699; AVX1-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
3700; AVX1:       # %bb.0:
3701; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3702; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3703; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
3704; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
3705; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
3706; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
3707; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3708; AVX1-NEXT:    retq
3709;
3710; AVX2-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
3711; AVX2:       # %bb.0:
3712; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
3713; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
3714; AVX2-NEXT:    retq
3715;
3716; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
3717; AVX512VL:       # %bb.0:
3718; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>
3719; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3720; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
3721; AVX512VL-NEXT:    retq
3722  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 20, i32 21, i32 22, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
3723  ret <16 x i16> %shuffle
3724}
3725
3726define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11(<16 x i16> %a, <16 x i16> %b) {
3727; AVX1-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
3728; AVX1:       # %bb.0:
3729; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3730; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3731; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
3732; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7]
3733; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
3734; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
3735; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
3736; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3737; AVX1-NEXT:    retq
3738;
3739; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
3740; AVX2:       # %bb.0:
3741; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3742; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3743; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,u,u>
3744; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3745; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23]
3746; AVX2-NEXT:    retq
3747;
3748; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
3749; AVX512VL:       # %bb.0:
3750; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11]
3751; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3752; AVX512VL-NEXT:    retq
3753  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 21, i32 20, i32 21, i32 22, i32 11, i32 8, i32 9, i32 10, i32 29, i32 28, i32 29, i32 30, i32 11>
3754  ret <16 x i16> %shuffle
3755}
3756
3757define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15(<16 x i16> %a, <16 x i16> %b) {
3758; AVX1-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
3759; AVX1:       # %bb.0:
3760; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3761; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3762; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
3763; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
3764; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
3765; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3766; AVX1-NEXT:    retq
3767;
3768; AVX2-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
3769; AVX2:       # %bb.0:
3770; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3771; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12,13,14],ymm0[15]
3772; AVX2-NEXT:    retq
3773;
3774; AVX512VL-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
3775; AVX512VL:       # %bb.0:
3776; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15]
3777; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3778; AVX512VL-NEXT:    retq
3779  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 20, i32 21, i32 22, i32 15, i32 8, i32 25, i32 10, i32 11, i32 28, i32 29, i32 30, i32 15>
3780  ret <16 x i16> %shuffle
3781}
3782
3783define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25(<16 x i16> %a, <16 x i16> %b) {
3784; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
3785; AVX1:       # %bb.0:
3786; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
3787; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3788; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3789; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
3790; AVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
3791; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm1[7]
3792; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
3793; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
3794; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
3795; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3796; AVX1-NEXT:    retq
3797;
3798; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
3799; AVX2-SLOW:       # %bb.0:
3800; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm1, %xmm1
3801; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %ymm1
3802; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
3803; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
3804; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
3805; AVX2-SLOW-NEXT:    retq
3806;
3807; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
3808; AVX2-FAST:       # %bb.0:
3809; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31]
3810; AVX2-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm1
3811; AVX2-FAST-NEXT:    vpbroadcastd %xmm1, %ymm1
3812; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
3813; AVX2-FAST-NEXT:    retq
3814;
3815; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
3816; AVX512VL:       # %bb.0:
3817; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25>
3818; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3819; AVX512VL-NEXT:    retq
3820  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 25, i32 undef, i32 undef, i32 undef, i32 9, i32 undef, i32 13, i32 15, i32 25>
3821  ret <16 x i16> %shuffle
3822}
3823
3824define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu(<16 x i16> %a, <16 x i16> %b) {
3825; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
3826; AVX1:       # %bb.0:
3827; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3828; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
3829; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
3830; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
3831; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
3832; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
3833; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3834; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
3835; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3836; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3837; AVX1-NEXT:    retq
3838;
3839; AVX2-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
3840; AVX2:       # %bb.0:
3841; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5,16,17,20,21,20,21,22,23,16,17,20,21,24,25,20,21]
3842; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3843; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
3844; AVX2-NEXT:    retq
3845;
3846; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
3847; AVX512VL:       # %bb.0:
3848; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u>
3849; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3850; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
3851; AVX512VL-NEXT:    retq
3852  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 16, i32 18, i32 20, i32 undef, i32 undef, i32 undef, i32 12, i32 undef, i32 24, i32 26, i32 28, i32 undef>
3853  ret <16 x i16> %shuffle
3854}
3855
3856define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
3857; AVX1-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
3858; AVX1:       # %bb.0:
3859; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3860; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3861; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
3862; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
3863; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
3864; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3865; AVX1-NEXT:    retq
3866;
3867; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
3868; AVX2:       # %bb.0:
3869; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
3870; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3871; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3872; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3873; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
3874; AVX2-NEXT:    retq
3875;
3876; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
3877; AVX512VL:       # %bb.0:
3878; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
3879; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
3880; AVX512VL-NEXT:    retq
3881  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 12, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12>
3882  ret <16 x i16> %shuffle
3883}
3884
3885define <16 x i16> @shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
3886; AVX1-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu:
3887; AVX1:       # %bb.0:
3888; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3889; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3890; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
3891; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
3892; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3893; AVX1-NEXT:    retq
3894;
3895; AVX2OR512VL-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu:
3896; AVX2OR512VL:       # %bb.0:
3897; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9],ymm1[26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25]
3898; AVX2OR512VL-NEXT:    retq
3899  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 22, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 30, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
3900  ret <16 x i16> %shuffle
3901}
3902
3903define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
3904; AVX1-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
3905; AVX1:       # %bb.0:
3906; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3907; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
3908; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3909; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3910; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3911; AVX1-NEXT:    retq
3912;
3913; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
3914; AVX2:       # %bb.0:
3915; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3916; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3917; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3918; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
3919; AVX2-NEXT:    retq
3920;
3921; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
3922; AVX512VL:       # %bb.0:
3923; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,12]
3924; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
3925; AVX512VL-NEXT:    retq
3926  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12>
3927  ret <16 x i16> %shuffle
3928}
3929
3930define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
3931; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu:
3932; AVX1:       # %bb.0:
3933; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3934; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3935; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3936; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3937; AVX1-NEXT:    retq
3938;
3939; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu:
3940; AVX2OR512VL:       # %bb.0:
3941; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
3942; AVX2OR512VL-NEXT:    retq
3943  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 14, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
3944  ret <16 x i16> %shuffle
3945}
3946
3947define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
3948; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu:
3949; AVX1:       # %bb.0:
3950; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
3951; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3952; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
3953; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3954; AVX1-NEXT:    retq
3955;
3956; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu:
3957; AVX2OR512VL:       # %bb.0:
3958; AVX2OR512VL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25]
3959; AVX2OR512VL-NEXT:    retq
3960  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
3961  ret <16 x i16> %shuffle
3962}
3963
3964define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10(<16 x i16> %a, <16 x i16> %b) {
3965; AVX1-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
3966; AVX1:       # %bb.0:
3967; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3968; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3969; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
3970; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
3971; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11]
3972; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3973; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
3974; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3975; AVX1-NEXT:    retq
3976;
3977; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
3978; AVX2:       # %bb.0:
3979; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
3980; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3981; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3982; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3983; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
3984; AVX2-NEXT:    retq
3985;
3986; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
3987; AVX512VL:       # %bb.0:
3988; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
3989; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3990; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
3991; AVX512VL-NEXT:    retq
3992  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10>
3993  ret <16 x i16> %shuffle
3994}
3995
3996define <16 x i16> @shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) {
3997; AVX1-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu:
3998; AVX1:       # %bb.0:
3999; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4000; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4001; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
4002; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
4003; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4004; AVX1-NEXT:    retq
4005;
4006; AVX2OR512VL-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu:
4007; AVX2OR512VL:       # %bb.0:
4008; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5],ymm1[22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21]
4009; AVX2OR512VL-NEXT:    retq
4010  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 20, i32 21, i32 22, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 undef, i32 undef, i32 9, i32 undef>
4011  ret <16 x i16> %shuffle
4012}
4013
4014define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10(<16 x i16> %a, <16 x i16> %b) {
4015; AVX1-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
4016; AVX1:       # %bb.0:
4017; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4018; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
4019; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
4020; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
4021; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4022; AVX1-NEXT:    retq
4023;
4024; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
4025; AVX2:       # %bb.0:
4026; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4027; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4028; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4029; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
4030; AVX2-NEXT:    retq
4031;
4032; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
4033; AVX512VL:       # %bb.0:
4034; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,4,5,6,7,0,1,10,11,12,13,14,15,8,9,10]
4035; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
4036; AVX512VL-NEXT:    retq
4037  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10>
4038  ret <16 x i16> %shuffle
4039}
4040
4041define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) {
4042; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu:
4043; AVX1:       # %bb.0:
4044; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
4045; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4046; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
4047; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4048; AVX1-NEXT:    retq
4049;
4050; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu:
4051; AVX2OR512VL:       # %bb.0:
4052; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
4053; AVX2OR512VL-NEXT:    retq
4054  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 9, i32 undef>
4055  ret <16 x i16> %shuffle
4056}
4057
4058define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
4059; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu:
4060; AVX1:       # %bb.0:
4061; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4062; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4063; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4064; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4065; AVX1-NEXT:    retq
4066;
4067; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu:
4068; AVX2OR512VL:       # %bb.0:
4069; AVX2OR512VL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4070; AVX2OR512VL-NEXT:    retq
4071  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
4072  ret <16 x i16> %shuffle
4073}
4074
4075define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26(<16 x i16> %a, <16 x i16> %b) {
4076; AVX1-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
4077; AVX1:       # %bb.0:
4078; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4079; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4080; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
4081; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4082; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11]
4083; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4084; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4085; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4086; AVX1-NEXT:    retq
4087;
4088; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
4089; AVX2:       # %bb.0:
4090; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
4091; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4092; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4093; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4094; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
4095; AVX2-NEXT:    retq
4096;
4097; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
4098; AVX512VL:       # %bb.0:
4099; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
4100; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
4101; AVX512VL-NEXT:    retq
4102  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26>
4103  ret <16 x i16> %shuffle
4104}
4105
4106define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu(<16 x i16> %a, <16 x i16> %b) {
4107; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu:
4108; AVX1:       # %bb.0:
4109; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4110; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4111; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
4112; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
4113; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4114; AVX1-NEXT:    retq
4115;
4116; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu:
4117; AVX2OR512VL:       # %bb.0:
4118; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5],ymm0[22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21]
4119; AVX2OR512VL-NEXT:    retq
4120  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 17, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 25, i32 undef>
4121  ret <16 x i16> %shuffle
4122}
4123
4124define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28(<16 x i16> %a, <16 x i16> %b) {
4125; AVX1-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
4126; AVX1:       # %bb.0:
4127; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4128; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4129; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
4130; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
4131; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
4132; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4133; AVX1-NEXT:    retq
4134;
4135; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
4136; AVX2:       # %bb.0:
4137; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
4138; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4139; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4140; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4141; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
4142; AVX2-NEXT:    retq
4143;
4144; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
4145; AVX512VL:       # %bb.0:
4146; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
4147; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
4148; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
4149; AVX512VL-NEXT:    retq
4150  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 28, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28>
4151  ret <16 x i16> %shuffle
4152}
4153
4154define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu(<16 x i16> %a, <16 x i16> %b) {
4155; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
4156; AVX1:       # %bb.0:
4157; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4158; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4159; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
4160; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
4161; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4162; AVX1-NEXT:    retq
4163;
4164; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
4165; AVX2OR512VL:       # %bb.0:
4166; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9],ymm0[26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25]
4167; AVX2OR512VL-NEXT:    retq
4168  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 14, i32 undef, i32 undef, i32 25, i32 26, i32 27, i32 undef>
4169  ret <16 x i16> %shuffle
4170}
4171
4172define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu(<16 x i16> %a, <16 x i16> %b) {
4173; AVX1-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
4174; AVX1:       # %bb.0:
4175; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4176; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4177; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4178; AVX1-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,4]
4179; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4180; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4181; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
4182; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4183; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4184; AVX1-NEXT:    retq
4185;
4186; AVX2-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
4187; AVX2:       # %bb.0:
4188; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8,9,10,11],ymm1[12],ymm0[13,14],ymm1[15]
4189; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15,30,31,30,31,22,23,22,23,24,25,24,25,26,27,30,31]
4190; AVX2-NEXT:    retq
4191;
4192; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
4193; AVX512VL:       # %bb.0:
4194; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>
4195; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
4196; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
4197; AVX512VL-NEXT:    retq
4198  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 undef, i32 3, i32 undef, i32 20, i32 20, i32 5, i32 undef, i32 31, i32 undef, i32 11, i32 undef, i32 28, i32 28, i32 13, i32 undef>
4199  ret <16 x i16> %shuffle
4200}
4201
4202define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) {
4203; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
4204; AVX1:       # %bb.0:
4205; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4206; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4207; AVX1-NEXT:    retq
4208;
4209; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
4210; AVX2OR512VL:       # %bb.0:
4211; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4212; AVX2OR512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4213; AVX2OR512VL-NEXT:    retq
4214  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
4215  ret <16 x i16> %shuffle
4216}
4217
4218define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
4219; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
4220; AVX1:       # %bb.0:
4221; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
4222; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4223; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4224; AVX1-NEXT:    retq
4225;
4226; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
4227; AVX2-SLOW:       # %bb.0:
4228; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
4229; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
4230; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4231; AVX2-SLOW-NEXT:    retq
4232;
4233; AVX2-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
4234; AVX2-FAST:       # %bb.0:
4235; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
4236; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4237; AVX2-FAST-NEXT:    retq
4238;
4239; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
4240; AVX512VL-SLOW:       # %bb.0:
4241; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
4242; AVX512VL-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
4243; AVX512VL-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4244; AVX512VL-SLOW-NEXT:    retq
4245;
4246; AVX512VL-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
4247; AVX512VL-FAST:       # %bb.0:
4248; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
4249; AVX512VL-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4250; AVX512VL-FAST-NEXT:    retq
4251  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4252  ret <16 x i16> %shuffle
4253}
4254
4255define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, <16 x i16> %b) {
4256; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
4257; AVX1:       # %bb.0:
4258; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4259; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
4260; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4261; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4262; AVX1-NEXT:    retq
4263;
4264; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
4265; AVX2OR512VL:       # %bb.0:
4266; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
4267; AVX2OR512VL-NEXT:    vpbroadcastw %xmm0, %ymm0
4268; AVX2OR512VL-NEXT:    retq
4269  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
4270  ret <16 x i16> %shuffle
4271}
4272
4273define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
4274; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
4275; ALL:       # %bb.0:
4276; ALL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4277; ALL-NEXT:    retq
4278  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4279  ret <16 x i16> %shuffle
4280}
4281
4282define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
4283; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
4284; AVX1:       # %bb.0:
4285; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
4286; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4287; AVX1-NEXT:    retq
4288;
4289; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
4290; AVX2-SLOW:       # %bb.0:
4291; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
4292; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
4293; AVX2-SLOW-NEXT:    retq
4294;
4295; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
4296; AVX2-FAST:       # %bb.0:
4297; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
4298; AVX2-FAST-NEXT:    retq
4299;
4300; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
4301; AVX512VL-SLOW:       # %bb.0:
4302; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
4303; AVX512VL-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
4304; AVX512VL-SLOW-NEXT:    retq
4305;
4306; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
4307; AVX512VL-FAST:       # %bb.0:
4308; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
4309; AVX512VL-FAST-NEXT:    retq
4310  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4311  ret <16 x i16> %shuffle
4312}
4313
4314define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
4315; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
4316; AVX1:       # %bb.0:
4317; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4318; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
4319; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4320; AVX1-NEXT:    retq
4321;
4322; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
4323; AVX2-SLOW:       # %bb.0:
4324; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
4325; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
4326; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
4327; AVX2-SLOW-NEXT:    retq
4328;
4329; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
4330; AVX2-FAST:       # %bb.0:
4331; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
4332; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
4333; AVX2-FAST-NEXT:    retq
4334;
4335; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
4336; AVX512VL-SLOW:       # %bb.0:
4337; AVX512VL-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
4338; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
4339; AVX512VL-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
4340; AVX512VL-SLOW-NEXT:    retq
4341;
4342; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
4343; AVX512VL-FAST:       # %bb.0:
4344; AVX512VL-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
4345; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
4346; AVX512VL-FAST-NEXT:    retq
4347  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4348  ret <16 x i16> %shuffle
4349}
4350
4351define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
4352; AVX1-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
4353; AVX1:       # %bb.0:
4354; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4355; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
4356; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
4357; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4358; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4359; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4360; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
4361; AVX1-NEXT:    retq
4362;
4363; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
4364; AVX2:       # %bb.0:
4365; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
4366; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
4367; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
4368; AVX2-NEXT:    retq
4369;
4370; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
4371; AVX512VL:       # %bb.0:
4372; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
4373; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
4374; AVX512VL-NEXT:    retq
4375  %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
4376  ret <16 x i16> %1
4377}
4378
4379define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
4380; AVX1-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
4381; AVX1:       # %bb.0:
4382; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4383; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4384; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
4385; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,2,4,5,6,7]
4386; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4387; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
4388; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
4389; AVX1-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,7]
4390; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
4391; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4392; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
4393; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4394; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
4395; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
4396; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4397; AVX1-NEXT:    retq
4398;
4399; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
4400; AVX2:       # %bb.0:
4401; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
4402; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
4403; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
4404; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4405; AVX2-NEXT:    retq
4406;
4407; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
4408; AVX512VL:       # %bb.0:
4409; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25]
4410; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
4411; AVX512VL-NEXT:    retq
4412  %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
4413  %2 = bitcast <16 x i16> %1 to <4 x i64>
4414  %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
4415  %4 = bitcast <4 x i64> %3 to <16 x i16>
4416  ret <16 x i16> %4
4417}
4418
4419define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) {
4420; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
4421; AVX1:       # %bb.0:
4422; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,7,5]
4423; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4424; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4425; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
4426; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4427; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4428; AVX1-NEXT:    retq
4429;
4430; AVX2-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
4431; AVX2-SLOW:       # %bb.0:
4432; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13]
4433; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
4434; AVX2-SLOW-NEXT:    retq
4435;
4436; AVX2-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
4437; AVX2-FAST:       # %bb.0:
4438; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27]
4439; AVX2-FAST-NEXT:    retq
4440;
4441; AVX512VL-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
4442; AVX512VL-SLOW:       # %bb.0:
4443; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13]
4444; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
4445; AVX512VL-SLOW-NEXT:    retq
4446;
4447; AVX512VL-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
4448; AVX512VL-FAST:       # %bb.0:
4449; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27]
4450; AVX512VL-FAST-NEXT:    retq
4451  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 4, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7, i32 5, i32 12, i32 14, i32 15, i32 undef, i32 undef, i32 14, i32 15, i32 13>
4452  ret <16 x i16> %shuffle
4453}
4454
4455define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
4456; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
4457; ALL:       # %bb.0:
4458; ALL-NEXT:    movzwl (%rdi), %eax
4459; ALL-NEXT:    vmovd %eax, %xmm0
4460; ALL-NEXT:    retq
4461  %val = load i16, i16* %ptr
4462  %i0 = insertelement <16 x i16> zeroinitializer, i16 %val, i32 0
4463  ret <16 x i16> %i0
4464}
4465
4466define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i16> %a, <16 x i16> %b) {
4467; ALL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
4468; ALL:       # %bb.0:
4469; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4470; ALL-NEXT:    retq
4471  %alo = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4472  %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4473  %shuf = shufflevector <8 x i16> %alo, <8 x i16> %bhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4474  ret <16 x i16> %shuf
4475}
4476
4477define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
4478; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
4479; AVX1:       # %bb.0:
4480; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
4481; AVX1-NEXT:    retq
4482;
4483; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
4484; AVX2:       # %bb.0:
4485; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
4486; AVX2-NEXT:    retq
4487;
4488; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
4489; AVX512VL:       # %bb.0:
4490; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
4491; AVX512VL-NEXT:    retq
4492  %ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4493  %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4494  %bc0hi = bitcast <8 x i16> %ahi to <16 x i8>
4495  %bc1hi = bitcast <8 x i16> %bhi to <16 x i8>
4496  %shuffle8 = shufflevector <16 x i8> %bc0hi, <16 x i8> %bc1hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
4497  %shuffle16 = bitcast <32 x i8> %shuffle8 to <16 x i16>
4498  ret <16 x i16> %shuffle16
4499}
4500
4501define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
4502; AVX1-LABEL: PR24935:
4503; AVX1:       # %bb.0:
4504; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
4505; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4506; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
4507; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6,7]
4508; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
4509; AVX1-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,5,5,6,7]
4510; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[2,3,2,3,4,5,6,7,8,9,8,9,0,1,2,3]
4511; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7]
4512; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6],xmm2[7]
4513; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
4514; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
4515; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
4516; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,4,5,4,5,10,11,4,5,14,15,12,13,0,1]
4517; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5],xmm0[6],xmm1[7]
4518; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4519; AVX1-NEXT:    retq
4520;
4521; AVX2-SLOW-LABEL: PR24935:
4522; AVX2-SLOW:       # %bb.0:
4523; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
4524; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
4525; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
4526; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
4527; AVX2-SLOW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
4528; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4529; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
4530; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
4531; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
4532; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
4533; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
4534; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
4535; AVX2-SLOW-NEXT:    retq
4536;
4537; AVX2-FAST-LABEL: PR24935:
4538; AVX2-FAST:       # %bb.0:
4539; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
4540; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
4541; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
4542; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
4543; AVX2-FAST-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
4544; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,2,3,6,7,10,11,10,11,12,13,14,15,16,17,18,19,18,19,22,23,26,27,26,27,28,29,30,31]
4545; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
4546; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
4547; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
4548; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
4549; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
4550; AVX2-FAST-NEXT:    retq
4551;
4552; AVX512VL-LABEL: PR24935:
4553; AVX512VL:       # %bb.0:
4554; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8]
4555; AVX512VL-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
4556; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
4557; AVX512VL-NEXT:    retq
4558  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 27, i32 26, i32 1, i32 29, i32 26, i32 23, i32 11, i32 16, i32 1, i32 9, i32 16, i32 28, i32 13, i32 4, i32 0, i32 24>
4559  ret <16 x i16> %shuffle
4560}
4561
4562define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
4563; AVX1-LABEL: PR34369:
4564; AVX1:       # %bb.0:
4565; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4566; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
4567; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,0,1,6,7,10,11,4,5,4,5,6,7]
4568; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6],xmm3[7]
4569; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[14,15,0,1,12,13,0,1,2,3,4,5,8,9,8,9]
4570; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
4571; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4572; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
4573; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
4574; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
4575; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
4576; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
4577; AVX1-NEXT:    retq
4578;
4579; AVX2-LABEL: PR34369:
4580; AVX2:       # %bb.0:
4581; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
4582; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
4583; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
4584; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4585; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
4586; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4587; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
4588; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
4589; AVX2-NEXT:    retq
4590;
4591; AVX512VL-LABEL: PR34369:
4592; AVX512VL:       # %bb.0:
4593; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12]
4594; AVX512VL-NEXT:    vptestnmw %ymm1, %ymm1, %k1
4595; AVX512VL-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
4596; AVX512VL-NEXT:    retq
4597  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
4598  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
4599  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
4600  ret <16 x i16> %res
4601}
4602
4603define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) {
4604; AVX1-LABEL: insert_dup_mem_v16i16_i32:
4605; AVX1:       # %bb.0:
4606; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4607; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
4608; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4609; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4610; AVX1-NEXT:    retq
4611;
4612; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_i32:
4613; AVX2OR512VL:       # %bb.0:
4614; AVX2OR512VL-NEXT:    vpbroadcastw (%rdi), %ymm0
4615; AVX2OR512VL-NEXT:    retq
4616  %tmp = load i32, i32* %ptr, align 4
4617  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
4618  %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
4619  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> zeroinitializer
4620  ret <16 x i16> %tmp3
4621}
4622
4623define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
4624; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16:
4625; AVX1:       # %bb.0:
4626; AVX1-NEXT:    movswl (%rdi), %eax
4627; AVX1-NEXT:    vmovd %eax, %xmm0
4628; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
4629; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4630; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4631; AVX1-NEXT:    retq
4632;
4633; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16:
4634; AVX2:       # %bb.0:
4635; AVX2-NEXT:    movswl (%rdi), %eax
4636; AVX2-NEXT:    vmovd %eax, %xmm0
4637; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
4638; AVX2-NEXT:    retq
4639;
4640; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16:
4641; AVX512VL:       # %bb.0:
4642; AVX512VL-NEXT:    movswl (%rdi), %eax
4643; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
4644; AVX512VL-NEXT:    retq
4645  %tmp = load i16, i16* %ptr, align 2
4646  %tmp1 = sext i16 %tmp to i32
4647  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
4648  %tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16>
4649  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <16 x i32> zeroinitializer
4650  ret <16 x i16> %tmp4
4651}
4652
4653define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
4654; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32:
4655; AVX1:       # %bb.0:
4656; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4657; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
4658; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4659; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4660; AVX1-NEXT:    retq
4661;
4662; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i32:
4663; AVX2OR512VL:       # %bb.0:
4664; AVX2OR512VL-NEXT:    vpbroadcastw 2(%rdi), %ymm0
4665; AVX2OR512VL-NEXT:    retq
4666  %tmp = load i32, i32* %ptr, align 4
4667  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
4668  %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
4669  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
4670  ret <16 x i16> %tmp3
4671}
4672
4673define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
4674; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
4675; AVX1:       # %bb.0:
4676; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4677; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
4678; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4679; AVX1-NEXT:    retq
4680;
4681; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32:
4682; AVX2OR512VL:       # %bb.0:
4683; AVX2OR512VL-NEXT:    vpbroadcastw 2(%rdi), %ymm0
4684; AVX2OR512VL-NEXT:    retq
4685  %tmp = load i32, i32* %ptr, align 4
4686  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
4687  %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
4688  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4689  ret <16 x i16> %tmp3
4690}
4691