• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

; Each invocation above checks the output under three prefixes: one shared by
; every run, one per instruction-set family, and one per feature level.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Broadcast byte 0 of %a into all sixteen result bytes; the mask never selects from %b.
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; FIXME: SSE2 should look like the following:
; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
; FIXME:       # BB#0:
; FIXME-NEXT:    punpcklbw %xmm0, %xmm0
; FIXME-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
; FIXME-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
; FIXME-NEXT:    retq
;
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i8> %shuffle
}

; Byte 0 of %a fills the low eight result bytes and byte 1 of %a fills the high eight.
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %shuffle
}

; Byte 0 of %a fills the low eight result bytes and byte 8 of %a fills the high eight.
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x i8> %shuffle
}

; Each of bytes 0-3 of %a is repeated four times in a row.
define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
  ret <16 x i8> %shuffle
}

; Each of bytes 4-7 of %a is repeated four times in a row.
define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i8> %shuffle
}

; Bytes 0, 4, 8 and 12 of %a are each repeated four times in a row.
define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
  ret <16 x i8> %shuffle
}

; Each of bytes 0-7 of %a is duplicated into an adjacent pair.
define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  ret <16 x i8> %shuffle
}

; The byte pair (0, 1) of %a is repeated eight times across the result.
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; FIXME: SSE2 should be the following:
; FIXME-LABEL: @shuffle_v16i8_0101010101010101
; FIXME:       # BB#0:
; FIXME-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
; FIXME-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
; FIXME-NEXT:    retq
;
; SSE2-LABEL: shuffle_v16i8_0101010101010101:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_0101010101010101:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_0101010101010101:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i8_0101010101010101:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i8> %shuffle
}

; Interleave the low eight bytes of %a with the low eight bytes of %b
; (mask indices 16 and above select from the second operand).
define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %shuffle
}

; Interleave the high eight bytes of %a with the high eight bytes of %b
; (mask indices 16 and above select from the second operand).
define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %shuffle
}

; Even result bytes are all byte 0 of %b (mask index 16); odd result bytes are
; bytes 0-7 of %a in order.
define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSE41:       # BB#0:
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
  ret <16 x i8> %shuffle
}

; Reverse the byte order inside each of the four 4-byte groups of %a.
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %shuffle
}

; Low eight result bytes: bytes 0-7 of %a, byte-reversed within each 4-byte group.
; High eight result bytes: bytes 0-7 of %b (mask indices 16-23), reversed the same way.
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSE41:       # BB#0:
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
  ret <16 x i8> %shuffle
}

; Four byte-reversed 4-byte groups, alternating sources: bytes 0-3 of %a,
; bytes 12-15 of %b, bytes 8-11 of %a, bytes 4-7 of %b.
define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
  ret <16 x i8> %shuffle
}

; Position-preserving blend: even result bytes come from %a and odd result
; bytes come from %b, each taken from the same byte position.
define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSE2:       # BB#0:
; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT:    andps %xmm2, %xmm0
; SSE2-NEXT:    andnps %xmm1, %xmm2
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

; Position-preserving blend: result bytes 3, 7, 11 and 15 come from %b; every
; other byte stays from %a.
define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSE2:       # BB#0:
; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT:    andps %xmm2, %xmm0
; SSE2-NEXT:    andnps %xmm1, %xmm2
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

; Result bytes 3, 7, 11 and 15 are taken from a zeroinitializer second operand
; (the "zz" positions in the name); every other byte passes through from %a.
define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; SSE:       # BB#0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; AVX:       # BB#0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

; Position-preserving blend: result bytes 4, 7, 12 and 15 come from %b; every
; other byte stays from %a.
define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSE2:       # BB#0:
; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE2-NEXT:    andps %xmm2, %xmm0
; SSE2-NEXT:    andnps %xmm1, %xmm2
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

; Position-preserving blend: result bytes 0-3, 8-9, 12 and 14 come from %b;
; bytes 4-7, 10-11, 13 and 15 stay from %a.
define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE2:       # BB#0:
; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE2-NEXT:    andps %xmm2, %xmm1
; SSE2-NEXT:    andnps %xmm0, %xmm2
; SSE2-NEXT:    orps %xmm1, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x i8> %shuffle
}

; Gathers bytes 0, 4, 8 and 12 of %a into the low four result bytes; the other
; twelve mask elements are undef, so the second operand is undef as well.
define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
; SSE2-LABEL: trunc_v4i32_shuffle:
; SSE2:       # BB#0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc_v4i32_shuffle:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc_v4i32_shuffle:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_shuffle:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %shuffle
}

580define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
581; We don't have anything useful to check here. This generates 100s of
582; instructions. Instead, just make sure we survived codegen.
; The label check below is followed by a plain match on the return (no NEXT
; suffix), so any number of instructions may appear between the two.
583; ALL-LABEL: stress_test0:
584; ALL:         retq
585entry:
; Ten chained two-input shuffles with arbitrary masks. The body reads only
; %s.0.3, %s.0.4, %s.0.5, %s.0.6, %s.0.8 and %s.0.9; the parameters %s.0.1,
; %s.0.2 and %s.0.7 are never referenced.
586  %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
587  %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
588  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
589  %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
590  %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
591  %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
592  %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
593  %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
594  %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
595  %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
596  ret <16 x i8> %s.16.0
597}
598
599define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
600; There is nothing interesting to check about these instructions other than
601; that they survive codegen. However, we actually do better and delete all of
602; them because the result is 'undef'.
603;
604; ALL-LABEL: undef_test1:
605; ALL:       # BB#0: # %entry
606; ALL-NEXT:    retq
607entry:
608  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
609  %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
610  %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
611  %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
612  %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
613  %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
614  %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
615  %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
616  %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
617  %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
618  %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
619  %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; %s.10.4 defines only its lane 1, while the final shuffle reads only lane 13
; of it (into result lane 12); every lane of %s.12.4 is therefore undef.
620  %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
621
622  ret <16 x i8> %s.12.4
623}
624
625define <16 x i8> @PR20540(<8 x i8> %a) {
; Widening shuffle from <8 x i8> to <16 x i8>: result lanes 0-7 copy %a[0..7],
; and lanes 8-15 all select mask index 8, the first element of the zero
; vector, so the upper half of the result is zero.
; NOTE(review): the even byte indices (0,2,4,...) in the expected pshufb
; suggest %a reaches the shuffle widened to 16-bit lanes - confirm.
626; SSE2-LABEL: PR20540:
627; SSE2:       # BB#0:
628; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
629; SSE2-NEXT:    packuswb %xmm0, %xmm0
630; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
631; SSE2-NEXT:    retq
632;
633; SSSE3-LABEL: PR20540:
634; SSSE3:       # BB#0:
635; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
636; SSSE3-NEXT:    retq
637;
638; SSE41-LABEL: PR20540:
639; SSE41:       # BB#0:
640; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
641; SSE41-NEXT:    retq
642;
643; AVX-LABEL: PR20540:
644; AVX:       # BB#0:
645; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
646; AVX-NEXT:    retq
647  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
648  ret <16 x i8> %shuffle
649}
650
651define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; Scalar %i is inserted into element 0 of an otherwise undef vector; the
; shuffle then places that element (mask index 16) in result lane 0 and takes
; lanes 1-15 from the zero vector.
652; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
653; SSE:       # BB#0:
654; SSE-NEXT:    movzbl %dil, %eax
655; SSE-NEXT:    movd %eax, %xmm0
656; SSE-NEXT:    retq
657;
658; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
659; AVX:       # BB#0:
660; AVX-NEXT:    movzbl %dil, %eax
661; AVX-NEXT:    vmovd %eax, %xmm0
662; AVX-NEXT:    retq
663  %a = insertelement <16 x i8> undef, i8 %i, i32 0
664  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
665  ret <16 x i8> %shuffle
666}
667
668define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; Scalar %i is inserted into element 0 of an otherwise undef vector; the
; shuffle places it (mask index 16) in result lane 5, and every other lane
; reads element 0 of the zero vector.
669; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
670; SSE2:       # BB#0:
671; SSE2-NEXT:    shll $8, %edi
672; SSE2-NEXT:    pxor %xmm0, %xmm0
673; SSE2-NEXT:    pinsrw $2, %edi, %xmm0
674; SSE2-NEXT:    retq
675;
676; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
677; SSSE3:       # BB#0:
678; SSSE3-NEXT:    shll $8, %edi
679; SSSE3-NEXT:    pxor %xmm0, %xmm0
680; SSSE3-NEXT:    pinsrw $2, %edi, %xmm0
681; SSSE3-NEXT:    retq
682;
683; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
684; SSE41:       # BB#0:
685; SSE41-NEXT:    pxor %xmm0, %xmm0
686; SSE41-NEXT:    pinsrb $5, %edi, %xmm0
687; SSE41-NEXT:    retq
688;
689; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
690; AVX:       # BB#0:
691; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
692; AVX-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
693; AVX-NEXT:    retq
694  %a = insertelement <16 x i8> undef, i8 %i, i32 0
695  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
696  ret <16 x i8> %shuffle
697}
698
699define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
; Scalar %i is inserted into element 0 of an otherwise undef vector; the
; shuffle places it (mask index 16) in result lane 15. Lanes 1, 2, 4 and 5 are
; undef in the mask; the remaining lanes read the zero vector.
700; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
701; SSE2:       # BB#0:
702; SSE2-NEXT:    shll $8, %edi
703; SSE2-NEXT:    pxor %xmm0, %xmm0
704; SSE2-NEXT:    pinsrw $7, %edi, %xmm0
705; SSE2-NEXT:    retq
706;
707; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
708; SSSE3:       # BB#0:
709; SSSE3-NEXT:    shll $8, %edi
710; SSSE3-NEXT:    pxor %xmm0, %xmm0
711; SSSE3-NEXT:    pinsrw $7, %edi, %xmm0
712; SSSE3-NEXT:    retq
713;
714; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
715; SSE41:       # BB#0:
716; SSE41-NEXT:    pxor %xmm0, %xmm0
717; SSE41-NEXT:    pinsrb $15, %edi, %xmm0
718; SSE41-NEXT:    retq
719;
720; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
721; AVX:       # BB#0:
722; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
723; AVX-NEXT:    vpinsrb $15, %edi, %xmm0, %xmm0
724; AVX-NEXT:    retq
725  %a = insertelement <16 x i8> undef, i8 %i, i32 0
726  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
727  ret <16 x i8> %shuffle
728}
729
730define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; Unlike the neighbouring insertion tests, %i goes into element 3 of %a. Mask
; index 19 (16 + 3) moves that element to result lane 2; every other lane
; reads the zero vector.
731; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
732; SSE2:       # BB#0:
733; SSE2-NEXT:    movzbl %dil, %eax
734; SSE2-NEXT:    pxor %xmm0, %xmm0
735; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
736; SSE2-NEXT:    retq
737;
738; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
739; SSSE3:       # BB#0:
740; SSSE3-NEXT:    movzbl %dil, %eax
741; SSSE3-NEXT:    pxor %xmm0, %xmm0
742; SSSE3-NEXT:    pinsrw $1, %eax, %xmm0
743; SSSE3-NEXT:    retq
744;
745; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
746; SSE41:       # BB#0:
747; SSE41-NEXT:    pxor %xmm0, %xmm0
748; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
749; SSE41-NEXT:    retq
750;
751; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
752; AVX:       # BB#0:
753; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
754; AVX-NEXT:    vpinsrb $2, %edi, %xmm0, %xmm0
755; AVX-NEXT:    retq
756  %a = insertelement <16 x i8> undef, i8 %i, i32 3
757  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
758  ret <16 x i8> %shuffle
759}
760
761define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
; Lanes 0-11 read the zero vector; lanes 12 and 14 take bytes 0 and 2 of %a
; (mask indices 16 and 18); lanes 13 and 15 are undef. The undef lanes are
; what allow the whole-register byte shift expected below.
762; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
763; SSE:       # BB#0:
764; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
765; SSE-NEXT:    retq
766;
767; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
768; AVX:       # BB#0:
769; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
770; AVX-NEXT:    retq
771  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
772  ret <16 x i8> %shuffle
773}
774
775define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; Lanes 0, 2 and 3 take bytes 12, 14 and 15 of %a (mask indices 28, 30, 31);
; lane 1 is undef; lanes 4-15 all read element 0 of the zero vector. The undef
; lane is what allows the whole-register byte shift expected below.
776; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
777; SSE:       # BB#0:
778; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
779; SSE-NEXT:    retq
780;
781; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
782; AVX:       # BB#0:
783; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
784; AVX-NEXT:    retq
785  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
786  ret <16 x i8> %shuffle
787}
788
789define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; Two-input byte rotation: lane 0 takes the last byte of %b (mask index 31)
; and lanes 1-15 take bytes 0-14 of %a.
790; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
791; SSE2:       # BB#0:
792; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
793; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
794; SSE2-NEXT:    por %xmm1, %xmm0
795; SSE2-NEXT:    retq
796;
797; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
798; SSSE3:       # BB#0:
799; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
800; SSSE3-NEXT:    retq
801;
802; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
803; SSE41:       # BB#0:
804; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
805; SSE41-NEXT:    retq
806;
807; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
808; AVX:       # BB#0:
809; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
810; AVX-NEXT:    retq
811  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
812  ret <16 x i8> %shuffle
813}
814
815define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; Single-input byte rotation of %a: lane 0 takes byte 15 and lanes 1-15 take
; bytes 0-14. Every mask index is below 16, so %b is never selected.
816; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
817; SSE2:       # BB#0:
818; SSE2-NEXT:    movdqa %xmm0, %xmm1
819; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
820; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
821; SSE2-NEXT:    por %xmm1, %xmm0
822; SSE2-NEXT:    retq
823;
824; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
825; SSSE3:       # BB#0:
826; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
827; SSSE3-NEXT:    retq
828;
829; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
830; SSE41:       # BB#0:
831; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
832; SSE41-NEXT:    retq
833;
834; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
835; AVX:       # BB#0:
836; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
837; AVX-NEXT:    retq
838  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
839  ret <16 x i8> %shuffle
840}
841
842define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
; Two-input byte rotation: lanes 0-14 take bytes 1-15 of %b (mask indices
; 17-31) and lane 15 takes byte 0 of %a.
843; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
844; SSE2:       # BB#0:
845; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
846; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
847; SSE2-NEXT:    por %xmm1, %xmm0
848; SSE2-NEXT:    retq
849;
850; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
851; SSSE3:       # BB#0:
852; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
853; SSSE3-NEXT:    retq
854;
855; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
856; SSE41:       # BB#0:
857; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
858; SSE41-NEXT:    retq
859;
860; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
861; AVX:       # BB#0:
862; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
863; AVX-NEXT:    retq
864  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
865  ret <16 x i8> %shuffle
866}
867
868define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
; Two-input byte rotation: lanes 0-14 take bytes 1-15 of %a and lane 15 takes
; byte 0 of %b (mask index 16).
869; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
870; SSE2:       # BB#0:
871; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
872; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
873; SSE2-NEXT:    por %xmm1, %xmm0
874; SSE2-NEXT:    retq
875;
876; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
877; SSSE3:       # BB#0:
878; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
879; SSSE3-NEXT:    movdqa %xmm1, %xmm0
880; SSSE3-NEXT:    retq
881;
882; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
883; SSE41:       # BB#0:
884; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
885; SSE41-NEXT:    movdqa %xmm1, %xmm0
886; SSE41-NEXT:    retq
887;
888; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
889; AVX:       # BB#0:
890; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
891; AVX-NEXT:    retq
892  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
893  ret <16 x i8> %shuffle
894}
895
896define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
; Single-input byte rotation of %a: lanes 0-14 take bytes 1-15 and lane 15
; takes byte 0. Every mask index is below 16, so %b is never selected.
897; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
898; SSE2:       # BB#0:
899; SSE2-NEXT:    movdqa %xmm0, %xmm1
900; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
901; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
902; SSE2-NEXT:    por %xmm1, %xmm0
903; SSE2-NEXT:    retq
904;
905; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
906; SSSE3:       # BB#0:
907; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
908; SSSE3-NEXT:    retq
909;
910; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
911; SSE41:       # BB#0:
912; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
913; SSE41-NEXT:    retq
914;
915; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
916; AVX:       # BB#0:
917; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
918; AVX-NEXT:    retq
919  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
920  ret <16 x i8> %shuffle
921}
922
923define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
; Two-input byte rotation: lane 0 takes byte 15 of %a and lanes 1-15 take
; bytes 0-14 of %b (mask indices 16-30).
924; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
925; SSE2:       # BB#0:
926; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
927; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
928; SSE2-NEXT:    por %xmm1, %xmm0
929; SSE2-NEXT:    retq
930;
931; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
932; SSSE3:       # BB#0:
933; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
934; SSSE3-NEXT:    movdqa %xmm1, %xmm0
935; SSSE3-NEXT:    retq
936;
937; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
938; SSE41:       # BB#0:
939; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
940; SSE41-NEXT:    movdqa %xmm1, %xmm0
941; SSE41-NEXT:    retq
942;
943; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
944; AVX:       # BB#0:
945; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
946; AVX-NEXT:    retq
947  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
948  ret <16 x i8> %shuffle
949}
950
951define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
; Lane 0 takes byte 0 of %a and lane 8 takes byte 1; the other fourteen lanes
; are undef. No mask index reaches 16, so the zero vector operand is never
; selected and the filler lanes may hold anything.
952; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
953; SSE2:       # BB#0:
954; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
955; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
956; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
957; SSE2-NEXT:    retq
958;
959; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
960; SSSE3:       # BB#0:
961; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
962; SSSE3-NEXT:    retq
963;
964; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
965; SSE41:       # BB#0:
966; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
967; SSE41-NEXT:    retq
968;
969; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
970; AVX:       # BB#0:
971; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
972; AVX-NEXT:    retq
973  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
974  ret <16 x i8> %shuffle
975}
976
977define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; Lane 0 takes byte 0 of %a and lane 8 takes byte 1; every other lane selects
; an element of the zero vector (mask indices 17-23 and 25-31). The result is
; bytes 0 and 1 of %a each zero-extended to fill an 8-byte group.
978; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
979; SSE2:       # BB#0:
980; SSE2-NEXT:    pxor %xmm1, %xmm1
981; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
982; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
983; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
984; SSE2-NEXT:    retq
985;
986; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
987; SSSE3:       # BB#0:
988; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
989; SSSE3-NEXT:    retq
990;
991; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
992; SSE41:       # BB#0:
993; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
994; SSE41-NEXT:    retq
995;
996; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
997; AVX:       # BB#0:
998; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
999; AVX-NEXT:    retq
1000  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1001  ret <16 x i8> %shuffle
1002}
1003
1004define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
; Lanes 0, 4, 8 and 12 take bytes 0-3 of %a; the other twelve lanes are undef.
; No mask index reaches 16, so the zero vector operand is never selected and
; the filler lanes may hold anything.
1005; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1006; SSE2:       # BB#0:
1007; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1008; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1009; SSE2-NEXT:    retq
1010;
1011; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1012; SSSE3:       # BB#0:
1013; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1014; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1015; SSSE3-NEXT:    retq
1016;
1017; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1018; SSE41:       # BB#0:
1019; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1020; SSE41-NEXT:    retq
1021;
1022; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1023; AVX:       # BB#0:
1024; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1025; AVX-NEXT:    retq
1026  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
1027  ret <16 x i8> %shuffle
1028}
1029
1030define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
; Lanes 0, 4, 8 and 12 take bytes 0-3 of %a; every other lane selects an
; element of the zero vector (mask indices 17 and up). The result is bytes
; 0-3 of %a each zero-extended to fill a 4-byte group.
1031; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1032; SSE2:       # BB#0:
1033; SSE2-NEXT:    pxor %xmm1, %xmm1
1034; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1035; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1036; SSE2-NEXT:    retq
1037;
1038; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1039; SSSE3:       # BB#0:
1040; SSSE3-NEXT:    pxor %xmm1, %xmm1
1041; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1042; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1043; SSSE3-NEXT:    retq
1044;
1045; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1046; SSE41:       # BB#0:
1047; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1048; SSE41-NEXT:    retq
1049;
1050; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1051; AVX:       # BB#0:
1052; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1053; AVX-NEXT:    retq
1054  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
1055  ret <16 x i8> %shuffle
1056}
1057
1058define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
; Even lanes 0, 2, ..., 14 take bytes 0-7 of %a; every odd lane is undef. No
; mask index reaches 16, so the zero vector operand is never selected and the
; filler lanes may hold anything.
1059; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1060; SSE2:       # BB#0:
1061; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1062; SSE2-NEXT:    retq
1063;
1064; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1065; SSSE3:       # BB#0:
1066; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1067; SSSE3-NEXT:    retq
1068;
1069; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1070; SSE41:       # BB#0:
1071; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1072; SSE41-NEXT:    retq
1073;
1074; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1075; AVX:       # BB#0:
1076; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1077; AVX-NEXT:    retq
1078  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
1079  ret <16 x i8> %shuffle
1080}
1081
1082define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
; Even lanes 0, 2, ..., 14 take bytes 0-7 of %a; every odd lane selects an
; element of the zero vector (odd mask indices 17-31). The result is bytes
; 0-7 of %a each zero-extended to fill a 2-byte group.
1083; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1084; SSE2:       # BB#0:
1085; SSE2-NEXT:    pxor %xmm1, %xmm1
1086; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1087; SSE2-NEXT:    retq
1088;
1089; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1090; SSSE3:       # BB#0:
1091; SSSE3-NEXT:    pxor %xmm1, %xmm1
1092; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1093; SSSE3-NEXT:    retq
1094;
1095; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1096; SSE41:       # BB#0:
1097; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1098; SSE41-NEXT:    retq
1099;
1100; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1101; AVX:       # BB#0:
1102; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1103; AVX-NEXT:    retq
1104  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
1105  ret <16 x i8> %shuffle
1106}
1107
1108define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
; Irregular two-input byte shuffle. Result lanes 4, 8 and 12 take bytes 6, 2
; and 2 of %b, lane 0 is undef, and every other lane is a byte of %a.
; The SSSE3, SSE4.1 and AVX runs expect one pshufb per input merged with a
; por; the SSE2 run expects a much longer unpack/shuffle/mask/pack sequence.
1109; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1110; SSE2:       # BB#0: # %entry
1111; SSE2-NEXT:    pxor %xmm2, %xmm2
1112; SSE2-NEXT:    movdqa %xmm0, %xmm3
1113; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1114; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
1115; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
1116; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
1117; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
1118; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1119; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
1120; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
1121; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
1122; SSE2-NEXT:    pand %xmm5, %xmm2
1123; SSE2-NEXT:    pandn %xmm4, %xmm5
1124; SSE2-NEXT:    por %xmm2, %xmm5
1125; SSE2-NEXT:    psrlq $16, %xmm3
1126; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1127; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3]
1128; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1129; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
1130; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1131; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1132; SSE2-NEXT:    packuswb %xmm5, %xmm2
1133; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1134; SSE2-NEXT:    pand %xmm0, %xmm2
1135; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1136; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
1137; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7]
1138; SSE2-NEXT:    pandn %xmm1, %xmm0
1139; SSE2-NEXT:    por %xmm2, %xmm0
1140; SSE2-NEXT:    retq
1141;
1142; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1143; SSSE3:       # BB#0: # %entry
1144; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1145; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1146; SSSE3-NEXT:    por %xmm1, %xmm0
1147; SSSE3-NEXT:    retq
1148;
1149; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1150; SSE41:       # BB#0: # %entry
1151; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1152; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1153; SSE41-NEXT:    por %xmm1, %xmm0
1154; SSE41-NEXT:    retq
1155;
1156; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1157; AVX:       # BB#0: # %entry
1158; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1159; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1160; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1161; AVX-NEXT:    retq
1162entry:
1163  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
1164
1165  ret <16 x i8> %shuffle
1166}
1167
1168define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
1169; Nothing interesting to test here. Just make sure we don't crash.
; Three chained two-input byte shuffles (the last mask has one undef lane);
; only the function label and a final retq are checked, on every run line.
1170; ALL-LABEL: stress_test2:
1171; ALL:         retq
1172entry:
1173  %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
1174  %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
1175  %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
1176
1177  ret <16 x i8> %s.2.0
1178}
1179
1180define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
; Every lane the shuffle selects is a constant zero: lanes 12-15 of the first
; operand are i8 0 (its undef lanes are never picked) and lanes 0-11 of the
; second operand come from a bitcast zeroinitializer. The checks expect the
; shuffle to fold away so both stores write the same zeroed register.
1181; SSE-LABEL: constant_gets_selected:
1182; SSE:       # BB#0: # %entry
1183; SSE-NEXT:    xorps %xmm0, %xmm0
1184; SSE-NEXT:    movaps %xmm0, (%rdi)
1185; SSE-NEXT:    movaps %xmm0, (%rsi)
1186; SSE-NEXT:    retq
1187;
1188; AVX-LABEL: constant_gets_selected:
1189; AVX:       # BB#0: # %entry
1190; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1191; AVX-NEXT:    vmovaps %xmm0, (%rdi)
1192; AVX-NEXT:    vmovaps %xmm0, (%rsi)
1193; AVX-NEXT:    retq
1194entry:
1195  %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
1196  %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
1197  %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
1198  store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16
1199  store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16
1200  ret void
1201}
1202
1203;
1204; Shuffle to logical bit shifts
1205;
1206
1207define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
; Each byte pair becomes (zero, even byte of %a), i.e. every 16-bit lane has
; its low byte moved to the high byte. Expected lowering is a single 16-bit
; left shift by 8. %b is unused; the zero lanes come from zeroinitializer.
1208; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1209; SSE:       # BB#0:
1210; SSE-NEXT:    psllw $8, %xmm0
1211; SSE-NEXT:    retq
1212;
1213; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1214; AVX:       # BB#0:
1215; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
1216; AVX-NEXT:    retq
1217  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
1218  ret <16 x i8> %shuffle
1219}
1220
1221define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
; Within each group of four bytes, the first byte of %a moves to the last
; position and the other three lanes are zero. Expected lowering is a single
; 32-bit left shift by 24.
1222; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1223; SSE:       # BB#0:
1224; SSE-NEXT:    pslld $24, %xmm0
1225; SSE-NEXT:    retq
1226;
1227; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1228; AVX:       # BB#0:
1229; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
1230; AVX-NEXT:    retq
1231  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
1232  ret <16 x i8> %shuffle
1233}
1234
1235define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
; Within each group of eight bytes, the first byte of %a moves to the last
; position and the other seven lanes are zero. Expected lowering is a single
; 64-bit left shift by 56.
1236; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1237; SSE:       # BB#0:
1238; SSE-NEXT:    psllq $56, %xmm0
1239; SSE-NEXT:    retq
1240;
1241; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1242; AVX:       # BB#0:
1243; AVX-NEXT:    vpsllq $56, %xmm0, %xmm0
1244; AVX-NEXT:    retq
1245  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
1246  ret <16 x i8> %shuffle
1247}
1248
1249define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; Shift-by-one-byte pattern with some lanes left undef: each group of eight
; bytes starts with a zero and then carries the preceding bytes of %a. The
; undef lanes must not block matching; expected lowering is a single 64-bit
; left shift by 8.
1250; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1251; SSE:       # BB#0:
1252; SSE-NEXT:    psllq $8, %xmm0
1253; SSE-NEXT:    retq
1254;
1255; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1256; AVX:       # BB#0:
1257; AVX-NEXT:    vpsllq $8, %xmm0, %xmm0
1258; AVX-NEXT:    retq
1259  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
1260  ret <16 x i8> %shuffle
1261}
1262
1263define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
; Sparse mask (many undef lanes) whose defined lanes agree with moving the
; odd byte of each 16-bit lane down and zeroing the high byte. Expected
; lowering is a single 16-bit logical right shift by 8.
1264; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1265; SSE:       # BB#0:
1266; SSE-NEXT:    psrlw $8, %xmm0
1267; SSE-NEXT:    retq
1268;
1269; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1270; AVX:       # BB#0:
1271; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1272; AVX-NEXT:    retq
1273  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
1274  ret <16 x i8> %shuffle
1275}
1276
1277define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; The defined lanes move the upper two bytes of each 32-bit lane of %a into
; the lower two and zero the rest; the remaining lanes are undef. Expected
; lowering is a single 32-bit logical right shift by 16.
1278; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1279; SSE:       # BB#0:
1280; SSE-NEXT:    psrld $16, %xmm0
1281; SSE-NEXT:    retq
1282;
1283; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1284; AVX:       # BB#0:
1285; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
1286; AVX-NEXT:    retq
1287  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
1288  ret <16 x i8> %shuffle
1289}
1290
1291define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; The defined lanes move the last byte of each group of eight bytes of %a to
; the first position and zero others; the remaining lanes are undef. Expected
; lowering is a single 64-bit logical right shift by 56.
1292; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1293; SSE:       # BB#0:
1294; SSE-NEXT:    psrlq $56, %xmm0
1295; SSE-NEXT:    retq
1296;
1297; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1298; AVX:       # BB#0:
1299; AVX-NEXT:    vpsrlq $56, %xmm0, %xmm0
1300; AVX-NEXT:    retq
1301  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
1302  ret <16 x i8> %shuffle
1303}
1304
1305define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; Selects the even-indexed bytes of %inval1 followed by the even-indexed
; bytes of %inval2. The SSE2 run expects both inputs to be masked and then
; combined with packuswb; the other runs expect one shared pshufb control
; applied to each input and the halves joined with punpcklqdq.
; NOTE(review): the name suggests a regression test for LLVM bug 12412 -
; confirm against the bug tracker.
1306; SSE2-LABEL: PR12412:
1307; SSE2:       # BB#0: # %entry
1308; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1309; SSE2-NEXT:    pand %xmm2, %xmm1
1310; SSE2-NEXT:    pand %xmm2, %xmm0
1311; SSE2-NEXT:    packuswb %xmm1, %xmm0
1312; SSE2-NEXT:    retq
1313;
1314; SSSE3-LABEL: PR12412:
1315; SSSE3:       # BB#0: # %entry
1316; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1317; SSSE3-NEXT:    pshufb %xmm2, %xmm1
1318; SSSE3-NEXT:    pshufb %xmm2, %xmm0
1319; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1320; SSSE3-NEXT:    retq
1321;
1322; SSE41-LABEL: PR12412:
1323; SSE41:       # BB#0: # %entry
1324; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1325; SSE41-NEXT:    pshufb %xmm2, %xmm1
1326; SSE41-NEXT:    pshufb %xmm2, %xmm0
1327; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1328; SSE41-NEXT:    retq
1329;
1330; AVX-LABEL: PR12412:
1331; AVX:       # BB#0: # %entry
1332; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1333; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1334; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1335; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1336; AVX-NEXT:    retq
1337entry:
1338  %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1339  ret <16 x i8> %0
1340}
1341
1342define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
; In each group of four bytes the first lane is undef, the next two take the
; following bytes of %a shifted down by one position, and the last is zero.
; Expected lowering is a single 32-bit logical right shift by 8.
1343; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1344; SSE:       # BB#0:
1345; SSE-NEXT:    psrld $8, %xmm0
1346; SSE-NEXT:    retq
1347;
1348; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1349; AVX:       # BB#0:
1350; AVX-NEXT:    vpsrld $8, %xmm0, %xmm0
1351; AVX-NEXT:    retq
1352  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
1353  ret <16 x i8> %shuffle
1354}
1355
1356define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
; Chain of three shuffles separated by bitcasts: a reversed byte interleave
; of the low halves of %a and %b, a reversal of the four 32-bit lanes, and a
; swap of adjacent 16-bit lanes. The checks expect the whole chain to combine
; into a single punpcklbw of %a and %b.
1357; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
1358; SSE:       # BB#0:
1359; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1360; SSE-NEXT:    retq
1361;
1362; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
1363; AVX:       # BB#0:
1364; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1365; AVX-NEXT:    retq
1366  %shuffle8  = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
1367  %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
1368  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1369  %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
1370  %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
1371  %bitcast8  = bitcast <8 x i16> %shuffle16 to <16 x i8>
1372  ret <16 x i8> %bitcast8
1373}
1374
1375define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; Loads an i32, inserts it into lane 0 of a zero <4 x i32>, and splats byte 0
; of the result across all 16 lanes. The AVX2 run expects the load to fold
; into a memory-operand vpbroadcastb; the other runs expect a movd load
; followed by a register shuffle sequence.
1376; SSE2-LABEL: insert_dup_mem_v16i8_i32:
1377; SSE2:       # BB#0:
1378; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1379; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1380; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1381; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1382; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1383; SSE2-NEXT:    retq
1384;
1385; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
1386; SSSE3:       # BB#0:
1387; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1388; SSSE3-NEXT:    pxor %xmm1, %xmm1
1389; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1390; SSSE3-NEXT:    retq
1391;
1392; SSE41-LABEL: insert_dup_mem_v16i8_i32:
1393; SSE41:       # BB#0:
1394; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1395; SSE41-NEXT:    pxor %xmm1, %xmm1
1396; SSE41-NEXT:    pshufb %xmm1, %xmm0
1397; SSE41-NEXT:    retq
1398;
1399; AVX1-LABEL: insert_dup_mem_v16i8_i32:
1400; AVX1:       # BB#0:
1401; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1402; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1403; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1404; AVX1-NEXT:    retq
1405;
1406; AVX2-LABEL: insert_dup_mem_v16i8_i32:
1407; AVX2:       # BB#0:
1408; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
1409; AVX2-NEXT:    retq
1410  %tmp = load i32, i32* %ptr, align 4
1411  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1412  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1413  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
1414  ret <16 x i8> %tmp3
1415}
1416
1417define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; Loads an i8, sign-extends it to i32, inserts it into lane 0 of a zero
; <4 x i32>, and splats byte 0 of the result across all 16 lanes. The AVX2
; run expects a memory-operand vpbroadcastb with no sign extension; the other
; runs expect movsbl + movd followed by a register shuffle sequence.
1418; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
1419; SSE2:       # BB#0:
1420; SSE2-NEXT:    movsbl (%rdi), %eax
1421; SSE2-NEXT:    movd %eax, %xmm0
1422; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1423; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1424; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1425; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1426; SSE2-NEXT:    retq
1427;
1428; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
1429; SSSE3:       # BB#0:
1430; SSSE3-NEXT:    movsbl (%rdi), %eax
1431; SSSE3-NEXT:    movd %eax, %xmm0
1432; SSSE3-NEXT:    pxor %xmm1, %xmm1
1433; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1434; SSSE3-NEXT:    retq
1435;
1436; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
1437; SSE41:       # BB#0:
1438; SSE41-NEXT:    movsbl (%rdi), %eax
1439; SSE41-NEXT:    movd %eax, %xmm0
1440; SSE41-NEXT:    pxor %xmm1, %xmm1
1441; SSE41-NEXT:    pshufb %xmm1, %xmm0
1442; SSE41-NEXT:    retq
1443;
1444; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
1445; AVX1:       # BB#0:
1446; AVX1-NEXT:    movsbl (%rdi), %eax
1447; AVX1-NEXT:    vmovd %eax, %xmm0
1448; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1449; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1450; AVX1-NEXT:    retq
1451;
1452; AVX2-LABEL: insert_dup_mem_v16i8_sext_i8:
1453; AVX2:       # BB#0:
1454; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
1455; AVX2-NEXT:    retq
1456  %tmp = load i8, i8* %ptr, align 1
1457  %tmp1 = sext i8 %tmp to i32
1458  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1459  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1460  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
1461  ret <16 x i8> %tmp4
1462}
1463
1464define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; Loads an i32, inserts it into lane 0 of a zero <4 x i32>, and splats byte 1
; of the result across all 16 lanes. The AVX2 run expects the load to fold
; into vpbroadcastb at offset 1 from %ptr; the other runs expect a movd load
; followed by a register shuffle sequence.
1465; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
1466; SSE2:       # BB#0:
1467; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1468; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1469; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1470; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1471; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1472; SSE2-NEXT:    retq
1473;
1474; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
1475; SSSE3:       # BB#0:
1476; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1477; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1478; SSSE3-NEXT:    retq
1479;
1480; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
1481; SSE41:       # BB#0:
1482; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1483; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1484; SSE41-NEXT:    retq
1485;
1486; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
1487; AVX1:       # BB#0:
1488; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1489; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1490; AVX1-NEXT:    retq
1491;
1492; AVX2-LABEL: insert_dup_elt1_mem_v16i8_i32:
1493; AVX2:       # BB#0:
1494; AVX2-NEXT:    vpbroadcastb 1(%rdi), %xmm0
1495; AVX2-NEXT:    retq
1496  %tmp = load i32, i32* %ptr, align 4
1497  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1498  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1499  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1500  ret <16 x i8> %tmp3
1501}
1502
1503define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; Loads an i32, inserts it into lane 0 of a zero <4 x i32>, and splats byte 2
; of the result across all 16 lanes. The AVX2 run expects the load to fold
; into vpbroadcastb at offset 2 from %ptr; the other runs expect a movd load
; followed by a register shuffle sequence.
1504; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
1505; SSE2:       # BB#0:
1506; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1507; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1508; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1509; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1510; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
1511; SSE2-NEXT:    retq
1512;
1513; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
1514; SSSE3:       # BB#0:
1515; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1516; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1517; SSSE3-NEXT:    retq
1518;
1519; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
1520; SSE41:       # BB#0:
1521; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1522; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1523; SSE41-NEXT:    retq
1524;
1525; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
1526; AVX1:       # BB#0:
1527; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1528; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1529; AVX1-NEXT:    retq
1530;
1531; AVX2-LABEL: insert_dup_elt2_mem_v16i8_i32:
1532; AVX2:       # BB#0:
1533; AVX2-NEXT:    vpbroadcastb 2(%rdi), %xmm0
1534; AVX2-NEXT:    retq
1535  %tmp = load i32, i32* %ptr, align 4
1536  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1537  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1538  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1539  ret <16 x i8> %tmp3
1540}
1541
1542define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; Loads an i8, sign-extends it to i32, inserts it into lane 0 of a zero
; <4 x i32>, and splats byte 1 of the result across all 16 lanes. That byte
; is produced by the sign extension rather than loaded, so no run folds the
; load into a broadcast. The AVX2 run expects movsbl, a shift right by 8 in
; the GPR, vmovd and a register vpbroadcastb; the other runs expect
; movsbl + movd followed by a register shuffle sequence.
1543; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1544; SSE2:       # BB#0:
1545; SSE2-NEXT:    movsbl (%rdi), %eax
1546; SSE2-NEXT:    movd %eax, %xmm0
1547; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1548; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1549; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1550; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1551; SSE2-NEXT:    retq
1552;
1553; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1554; SSSE3:       # BB#0:
1555; SSSE3-NEXT:    movsbl (%rdi), %eax
1556; SSSE3-NEXT:    movd %eax, %xmm0
1557; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1558; SSSE3-NEXT:    retq
1559;
1560; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1561; SSE41:       # BB#0:
1562; SSE41-NEXT:    movsbl (%rdi), %eax
1563; SSE41-NEXT:    movd %eax, %xmm0
1564; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1565; SSE41-NEXT:    retq
1566;
1567; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1568; AVX1:       # BB#0:
1569; AVX1-NEXT:    movsbl (%rdi), %eax
1570; AVX1-NEXT:    vmovd %eax, %xmm0
1571; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1572; AVX1-NEXT:    retq
1573;
1574; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1575; AVX2:       # BB#0:
1576; AVX2-NEXT:    movsbl (%rdi), %eax
1577; AVX2-NEXT:    shrl $8, %eax
1578; AVX2-NEXT:    vmovd %eax, %xmm0
1579; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
1580; AVX2-NEXT:    retq
1581  %tmp = load i8, i8* %ptr, align 1
1582  %tmp1 = sext i8 %tmp to i32
1583  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1584  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1585  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1586  ret <16 x i8> %tmp4
1587}
1588
1589define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; Loads an i8, sign-extends it to i32, inserts it into lane 0 of a zero
; <4 x i32>, and splats byte 2 of the result across all 16 lanes. That byte
; is produced by the sign extension rather than loaded, so no run folds the
; load into a broadcast. The AVX2 run expects movsbl, a shift right by 16 in
; the GPR, vmovd and a register vpbroadcastb; the other runs expect
; movsbl + movd followed by a register shuffle sequence.
1590; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1591; SSE2:       # BB#0:
1592; SSE2-NEXT:    movsbl (%rdi), %eax
1593; SSE2-NEXT:    movd %eax, %xmm0
1594; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1595; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1596; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1597; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
1598; SSE2-NEXT:    retq
1599;
1600; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1601; SSSE3:       # BB#0:
1602; SSSE3-NEXT:    movsbl (%rdi), %eax
1603; SSSE3-NEXT:    movd %eax, %xmm0
1604; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1605; SSSE3-NEXT:    retq
1606;
1607; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1608; SSE41:       # BB#0:
1609; SSE41-NEXT:    movsbl (%rdi), %eax
1610; SSE41-NEXT:    movd %eax, %xmm0
1611; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1612; SSE41-NEXT:    retq
1613;
1614; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1615; AVX1:       # BB#0:
1616; AVX1-NEXT:    movsbl (%rdi), %eax
1617; AVX1-NEXT:    vmovd %eax, %xmm0
1618; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1619; AVX1-NEXT:    retq
1620;
1621; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1622; AVX2:       # BB#0:
1623; AVX2-NEXT:    movsbl (%rdi), %eax
1624; AVX2-NEXT:    shrl $16, %eax
1625; AVX2-NEXT:    vmovd %eax, %xmm0
1626; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
1627; AVX2-NEXT:    retq
1628  %tmp = load i8, i8* %ptr, align 1
1629  %tmp1 = sext i8 %tmp to i32
1630  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1631  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1632  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1633  ret <16 x i8> %tmp4
1634}
1635