• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
9;
10; Just one 32-bit run to make sure we do reasonable things there.
11; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
12
; Sign-extends the low eight i8 lanes of %A (picked by the shufflevector mask
; 0..7) to <8 x i16>.
; Per the assertions below, the SSE2 and SSSE3 runs use punpcklbw followed by
; psraw $8, while the SSE4.1, AVX family and 32-bit runs use a single
; pmovsxbw / vpmovsxbw.
13define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
14; SSE2-LABEL: sext_16i8_to_8i16:
15; SSE2:       # %bb.0: # %entry
16; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
17; SSE2-NEXT:    psraw $8, %xmm0
18; SSE2-NEXT:    retq
19;
20; SSSE3-LABEL: sext_16i8_to_8i16:
21; SSSE3:       # %bb.0: # %entry
22; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
23; SSSE3-NEXT:    psraw $8, %xmm0
24; SSSE3-NEXT:    retq
25;
26; SSE41-LABEL: sext_16i8_to_8i16:
27; SSE41:       # %bb.0: # %entry
28; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
29; SSE41-NEXT:    retq
30;
31; AVX-LABEL: sext_16i8_to_8i16:
32; AVX:       # %bb.0: # %entry
33; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
34; AVX-NEXT:    retq
35;
36; X32-SSE41-LABEL: sext_16i8_to_8i16:
37; X32-SSE41:       # %bb.0: # %entry
38; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
39; X32-SSE41-NEXT:    retl
40entry:
41  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
42  %C = sext <8 x i8> %B to <8 x i16>
43  ret <8 x i16> %C
44}
45
; Sign-extends all sixteen i8 lanes of %A to <16 x i16>; the result is twice as
; wide as the argument.
; Per the assertions below, the SSE2 and SSSE3 runs build each half with
; punpcklbw/punpckhbw plus psraw $8, the SSE4.1 and AVX1 runs use two
; pmovsxbw-style extensions (AVX1 joins them with vinsertf128), and the AVX2 and
; AVX512 runs use one vpmovsxbw from xmm0 to ymm0.
46define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
47; SSE2-LABEL: sext_16i8_to_16i16:
48; SSE2:       # %bb.0: # %entry
49; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
50; SSE2-NEXT:    psraw $8, %xmm2
51; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
52; SSE2-NEXT:    psraw $8, %xmm1
53; SSE2-NEXT:    movdqa %xmm2, %xmm0
54; SSE2-NEXT:    retq
55;
56; SSSE3-LABEL: sext_16i8_to_16i16:
57; SSSE3:       # %bb.0: # %entry
58; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
59; SSSE3-NEXT:    psraw $8, %xmm2
60; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
61; SSSE3-NEXT:    psraw $8, %xmm1
62; SSSE3-NEXT:    movdqa %xmm2, %xmm0
63; SSSE3-NEXT:    retq
64;
65; SSE41-LABEL: sext_16i8_to_16i16:
66; SSE41:       # %bb.0: # %entry
67; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
68; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
69; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
70; SSE41-NEXT:    movdqa %xmm2, %xmm0
71; SSE41-NEXT:    retq
72;
73; AVX1-LABEL: sext_16i8_to_16i16:
74; AVX1:       # %bb.0: # %entry
75; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
76; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
77; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
78; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
79; AVX1-NEXT:    retq
80;
81; AVX2-LABEL: sext_16i8_to_16i16:
82; AVX2:       # %bb.0: # %entry
83; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
84; AVX2-NEXT:    retq
85;
86; AVX512-LABEL: sext_16i8_to_16i16:
87; AVX512:       # %bb.0: # %entry
88; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
89; AVX512-NEXT:    retq
90;
91; X32-SSE41-LABEL: sext_16i8_to_16i16:
92; X32-SSE41:       # %bb.0: # %entry
93; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
94; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
95; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
96; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
97; X32-SSE41-NEXT:    retl
98entry:
99  %B = sext <16 x i8> %A to <16 x i16>
100  ret <16 x i16> %B
101}
102
; Sign-extends all thirty-two i8 lanes of %A to <32 x i16>.
; Per the assertions below, only the AVX512BW run is expected to do this in one
; instruction (vpmovsxbw from ymm0 to zmm0); the AVX2 and AVX512F runs extend the
; two 128-bit halves separately, and the SSE and AVX1 runs extend four quarters.
103define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
104; SSE2-LABEL: sext_32i8_to_32i16:
105; SSE2:       # %bb.0: # %entry
106; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
107; SSE2-NEXT:    psraw $8, %xmm4
108; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
109; SSE2-NEXT:    psraw $8, %xmm5
110; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
111; SSE2-NEXT:    psraw $8, %xmm2
112; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
113; SSE2-NEXT:    psraw $8, %xmm3
114; SSE2-NEXT:    movdqa %xmm4, %xmm0
115; SSE2-NEXT:    movdqa %xmm5, %xmm1
116; SSE2-NEXT:    retq
117;
118; SSSE3-LABEL: sext_32i8_to_32i16:
119; SSSE3:       # %bb.0: # %entry
120; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
121; SSSE3-NEXT:    psraw $8, %xmm4
122; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
123; SSSE3-NEXT:    psraw $8, %xmm5
124; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
125; SSSE3-NEXT:    psraw $8, %xmm2
126; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
127; SSSE3-NEXT:    psraw $8, %xmm3
128; SSSE3-NEXT:    movdqa %xmm4, %xmm0
129; SSSE3-NEXT:    movdqa %xmm5, %xmm1
130; SSSE3-NEXT:    retq
131;
132; SSE41-LABEL: sext_32i8_to_32i16:
133; SSE41:       # %bb.0: # %entry
134; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
135; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
136; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
137; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
138; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
139; SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
140; SSE41-NEXT:    movdqa %xmm5, %xmm0
141; SSE41-NEXT:    movdqa %xmm4, %xmm1
142; SSE41-NEXT:    retq
143;
144; AVX1-LABEL: sext_32i8_to_32i16:
145; AVX1:       # %bb.0: # %entry
146; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
147; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
148; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
149; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
150; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
151; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
152; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
153; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
154; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
155; AVX1-NEXT:    vmovaps %ymm2, %ymm0
156; AVX1-NEXT:    retq
157;
158; AVX2-LABEL: sext_32i8_to_32i16:
159; AVX2:       # %bb.0: # %entry
160; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
161; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
162; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
163; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
164; AVX2-NEXT:    retq
165;
166; AVX512F-LABEL: sext_32i8_to_32i16:
167; AVX512F:       # %bb.0: # %entry
168; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm2
169; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
170; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
171; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
172; AVX512F-NEXT:    retq
173;
174; AVX512BW-LABEL: sext_32i8_to_32i16:
175; AVX512BW:       # %bb.0: # %entry
176; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
177; AVX512BW-NEXT:    retq
178;
179; X32-SSE41-LABEL: sext_32i8_to_32i16:
180; X32-SSE41:       # %bb.0: # %entry
181; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
182; X32-SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
183; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
184; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
185; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
186; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
187; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
188; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
189; X32-SSE41-NEXT:    retl
190entry:
191  %B = sext <32 x i8> %A to <32 x i16>
192  ret <32 x i16> %B
193}
194
; Sign-extends the low four i8 lanes of %A (shufflevector mask 0..3) to
; <4 x i32>.
; Per the assertions below, the SSE2 and SSSE3 runs widen with punpcklbw and
; punpcklwd and then shift with psrad $24; the remaining runs use a single
; pmovsxbd / vpmovsxbd.
195define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
196; SSE2-LABEL: sext_16i8_to_4i32:
197; SSE2:       # %bb.0: # %entry
198; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
199; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
200; SSE2-NEXT:    psrad $24, %xmm0
201; SSE2-NEXT:    retq
202;
203; SSSE3-LABEL: sext_16i8_to_4i32:
204; SSSE3:       # %bb.0: # %entry
205; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
206; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
207; SSSE3-NEXT:    psrad $24, %xmm0
208; SSSE3-NEXT:    retq
209;
210; SSE41-LABEL: sext_16i8_to_4i32:
211; SSE41:       # %bb.0: # %entry
212; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
213; SSE41-NEXT:    retq
214;
215; AVX-LABEL: sext_16i8_to_4i32:
216; AVX:       # %bb.0: # %entry
217; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
218; AVX-NEXT:    retq
219;
220; X32-SSE41-LABEL: sext_16i8_to_4i32:
221; X32-SSE41:       # %bb.0: # %entry
222; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
223; X32-SSE41-NEXT:    retl
224entry:
225  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
226  %C = sext <4 x i8> %B to <4 x i32>
227  ret <4 x i32> %C
228}
229
; Sign-extends the low eight i8 lanes of %A (shufflevector mask 0..7) to
; <8 x i32>.
; Per the assertions below, the SSE2 and SSSE3 sequences differ here (the SSSE3
; run places the bytes for the upper half with pshufb), the SSE4.1 and AVX1 runs
; use two pmovsxbd-style extensions, and the AVX2 and AVX512 runs use one
; vpmovsxbd from xmm0 to ymm0.
230define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
231; SSE2-LABEL: sext_16i8_to_8i32:
232; SSE2:       # %bb.0: # %entry
233; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
234; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
235; SSE2-NEXT:    psrad $24, %xmm2
236; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
237; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
238; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
239; SSE2-NEXT:    psrad $24, %xmm1
240; SSE2-NEXT:    movdqa %xmm2, %xmm0
241; SSE2-NEXT:    retq
242;
243; SSSE3-LABEL: sext_16i8_to_8i32:
244; SSSE3:       # %bb.0: # %entry
245; SSSE3-NEXT:    movdqa %xmm0, %xmm1
246; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
247; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
248; SSSE3-NEXT:    psrad $24, %xmm0
249; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
250; SSSE3-NEXT:    psrad $24, %xmm1
251; SSSE3-NEXT:    retq
252;
253; SSE41-LABEL: sext_16i8_to_8i32:
254; SSE41:       # %bb.0: # %entry
255; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
256; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
257; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
258; SSE41-NEXT:    movdqa %xmm2, %xmm0
259; SSE41-NEXT:    retq
260;
261; AVX1-LABEL: sext_16i8_to_8i32:
262; AVX1:       # %bb.0: # %entry
263; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
264; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
265; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
266; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
267; AVX1-NEXT:    retq
268;
269; AVX2-LABEL: sext_16i8_to_8i32:
270; AVX2:       # %bb.0: # %entry
271; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
272; AVX2-NEXT:    retq
273;
274; AVX512-LABEL: sext_16i8_to_8i32:
275; AVX512:       # %bb.0: # %entry
276; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
277; AVX512-NEXT:    retq
278;
279; X32-SSE41-LABEL: sext_16i8_to_8i32:
280; X32-SSE41:       # %bb.0: # %entry
281; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
282; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
283; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
284; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
285; X32-SSE41-NEXT:    retl
286entry:
287  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
288  %C = sext <8 x i8> %B to <8 x i32>
289  ret <8 x i32> %C
290}
291
; Sign-extends all sixteen i8 lanes of %A to <16 x i32>.
; Per the assertions below, the AVX512 runs are expected to emit one vpmovsxbd
; from xmm0 to zmm0, the AVX2 run two vpmovsxbd into ymm registers, and the SSE
; and AVX1 runs four 128-bit extensions.
292define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
293; SSE2-LABEL: sext_16i8_to_16i32:
294; SSE2:       # %bb.0: # %entry
295; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
296; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
297; SSE2-NEXT:    psrad $24, %xmm4
298; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
299; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
300; SSE2-NEXT:    psrad $24, %xmm2
301; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
302; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
303; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
304; SSE2-NEXT:    psrad $24, %xmm1
305; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
306; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
307; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
308; SSE2-NEXT:    psrad $24, %xmm3
309; SSE2-NEXT:    movdqa %xmm4, %xmm0
310; SSE2-NEXT:    retq
311;
312; SSSE3-LABEL: sext_16i8_to_16i32:
313; SSSE3:       # %bb.0: # %entry
314; SSSE3-NEXT:    movdqa %xmm0, %xmm3
315; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
316; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
317; SSSE3-NEXT:    psrad $24, %xmm0
318; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
319; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
320; SSSE3-NEXT:    psrad $24, %xmm2
321; SSSE3-NEXT:    movdqa %xmm3, %xmm1
322; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
323; SSSE3-NEXT:    psrad $24, %xmm1
324; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,12,u,u,u,13,u,u,u,14,u,u,u,15]
325; SSSE3-NEXT:    psrad $24, %xmm3
326; SSSE3-NEXT:    retq
327;
328; SSE41-LABEL: sext_16i8_to_16i32:
329; SSE41:       # %bb.0: # %entry
330; SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
331; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
332; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
333; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
334; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
335; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
336; SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
337; SSE41-NEXT:    movdqa %xmm4, %xmm0
338; SSE41-NEXT:    retq
339;
340; AVX1-LABEL: sext_16i8_to_16i32:
341; AVX1:       # %bb.0: # %entry
342; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
343; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
344; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
345; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
346; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
347; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
348; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
349; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
350; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
351; AVX1-NEXT:    vmovaps %ymm2, %ymm0
352; AVX1-NEXT:    retq
353;
354; AVX2-LABEL: sext_16i8_to_16i32:
355; AVX2:       # %bb.0: # %entry
356; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm2
357; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
358; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
359; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
360; AVX2-NEXT:    retq
361;
362; AVX512-LABEL: sext_16i8_to_16i32:
363; AVX512:       # %bb.0: # %entry
364; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
365; AVX512-NEXT:    retq
366;
367; X32-SSE41-LABEL: sext_16i8_to_16i32:
368; X32-SSE41:       # %bb.0: # %entry
369; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
370; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
371; X32-SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
372; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
373; X32-SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
374; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
375; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
376; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
377; X32-SSE41-NEXT:    retl
378entry:
379  %B = sext <16 x i8> %A to <16 x i32>
380  ret <16 x i32> %B
381}
382
; Sign-extends the low two i8 lanes of %A (shufflevector mask 0..1) to
; <2 x i64>.
; Per the assertions below, the SSE2 and SSSE3 runs interleave a psrad $24
; result with a psrad $31 copy via punpckldq; the remaining runs use a single
; pmovsxbq / vpmovsxbq.
383define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
384; SSE2-LABEL: sext_16i8_to_2i64:
385; SSE2:       # %bb.0: # %entry
386; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
387; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
388; SSE2-NEXT:    movdqa %xmm0, %xmm1
389; SSE2-NEXT:    psrad $31, %xmm1
390; SSE2-NEXT:    psrad $24, %xmm0
391; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
392; SSE2-NEXT:    retq
393;
394; SSSE3-LABEL: sext_16i8_to_2i64:
395; SSSE3:       # %bb.0: # %entry
396; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
397; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
398; SSSE3-NEXT:    movdqa %xmm0, %xmm1
399; SSSE3-NEXT:    psrad $31, %xmm1
400; SSSE3-NEXT:    psrad $24, %xmm0
401; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
402; SSSE3-NEXT:    retq
403;
404; SSE41-LABEL: sext_16i8_to_2i64:
405; SSE41:       # %bb.0: # %entry
406; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
407; SSE41-NEXT:    retq
408;
409; AVX-LABEL: sext_16i8_to_2i64:
410; AVX:       # %bb.0: # %entry
411; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
412; AVX-NEXT:    retq
413;
414; X32-SSE41-LABEL: sext_16i8_to_2i64:
415; X32-SSE41:       # %bb.0: # %entry
416; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
417; X32-SSE41-NEXT:    retl
418entry:
419  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
420  %C = sext <2 x i8> %B to <2 x i64>
421  ret <2 x i64> %C
422}
423
; Sign-extends the low four i8 lanes of %A (shufflevector mask 0..3) to
; <4 x i64>.
; Per the assertions below, the SSE4.1 and AVX1 runs use two pmovsxbq-style
; extensions with a 16-bit psrld in between to reach lanes 2 and 3, and the AVX2
; and AVX512 runs use one vpmovsxbq from xmm0 to ymm0.
424define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
425; SSE2-LABEL: sext_16i8_to_4i64:
426; SSE2:       # %bb.0: # %entry
427; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
428; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
429; SSE2-NEXT:    movdqa %xmm2, %xmm1
430; SSE2-NEXT:    psrad $31, %xmm1
431; SSE2-NEXT:    psrad $24, %xmm2
432; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
433; SSE2-NEXT:    psrld $16, %xmm0
434; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
435; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
436; SSE2-NEXT:    movdqa %xmm1, %xmm0
437; SSE2-NEXT:    psrad $31, %xmm0
438; SSE2-NEXT:    psrad $24, %xmm1
439; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
440; SSE2-NEXT:    movdqa %xmm2, %xmm0
441; SSE2-NEXT:    retq
442;
443; SSSE3-LABEL: sext_16i8_to_4i64:
444; SSSE3:       # %bb.0: # %entry
445; SSSE3-NEXT:    movdqa %xmm0, %xmm1
446; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
447; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
448; SSSE3-NEXT:    movdqa %xmm0, %xmm2
449; SSSE3-NEXT:    psrad $31, %xmm2
450; SSSE3-NEXT:    psrad $24, %xmm0
451; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
452; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,u,u,u,3,u,u,u,u,u,u,u,u]
453; SSSE3-NEXT:    movdqa %xmm1, %xmm2
454; SSSE3-NEXT:    psrad $31, %xmm2
455; SSSE3-NEXT:    psrad $24, %xmm1
456; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
457; SSSE3-NEXT:    retq
458;
459; SSE41-LABEL: sext_16i8_to_4i64:
460; SSE41:       # %bb.0: # %entry
461; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
462; SSE41-NEXT:    psrld $16, %xmm0
463; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
464; SSE41-NEXT:    movdqa %xmm2, %xmm0
465; SSE41-NEXT:    retq
466;
467; AVX1-LABEL: sext_16i8_to_4i64:
468; AVX1:       # %bb.0: # %entry
469; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
470; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
471; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
472; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
473; AVX1-NEXT:    retq
474;
475; AVX2-LABEL: sext_16i8_to_4i64:
476; AVX2:       # %bb.0: # %entry
477; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
478; AVX2-NEXT:    retq
479;
480; AVX512-LABEL: sext_16i8_to_4i64:
481; AVX512:       # %bb.0: # %entry
482; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
483; AVX512-NEXT:    retq
484;
485; X32-SSE41-LABEL: sext_16i8_to_4i64:
486; X32-SSE41:       # %bb.0: # %entry
487; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
488; X32-SSE41-NEXT:    psrld $16, %xmm0
489; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
490; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
491; X32-SSE41-NEXT:    retl
492entry:
493  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
494  %C = sext <4 x i8> %B to <4 x i64>
495  ret <4 x i64> %C
496}
497
; Sign-extends the low eight i8 lanes of %A (shufflevector mask 0..7) to
; <8 x i64>.
; Per the assertions below, the AVX512 runs are expected to emit one vpmovsxbq
; from xmm0 to zmm0, the AVX2 run two vpmovsxbq into ymm registers, and the
; SSE4.1 and AVX1 runs four 128-bit pmovsxbq-style extensions.
498define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
499; SSE2-LABEL: sext_16i8_to_8i64:
500; SSE2:       # %bb.0: # %entry
501; SSE2-NEXT:    movdqa %xmm0, %xmm1
502; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
503; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
504; SSE2-NEXT:    movdqa %xmm0, %xmm2
505; SSE2-NEXT:    psrad $31, %xmm2
506; SSE2-NEXT:    psrad $24, %xmm0
507; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
508; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
509; SSE2-NEXT:    psrld $16, %xmm1
510; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
511; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
512; SSE2-NEXT:    movdqa %xmm1, %xmm2
513; SSE2-NEXT:    psrad $31, %xmm2
514; SSE2-NEXT:    psrad $24, %xmm1
515; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
516; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
517; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
518; SSE2-NEXT:    movdqa %xmm2, %xmm4
519; SSE2-NEXT:    psrad $31, %xmm4
520; SSE2-NEXT:    psrad $24, %xmm2
521; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
522; SSE2-NEXT:    psrld $16, %xmm3
523; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
524; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
525; SSE2-NEXT:    movdqa %xmm3, %xmm4
526; SSE2-NEXT:    psrad $31, %xmm4
527; SSE2-NEXT:    psrad $24, %xmm3
528; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
529; SSE2-NEXT:    retq
530;
531; SSSE3-LABEL: sext_16i8_to_8i64:
532; SSSE3:       # %bb.0: # %entry
533; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <u,u,u,2,u,u,u,3,u,u,u,u,u,u,u,u>
534; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
535; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
536; SSSE3-NEXT:    movdqa %xmm0, %xmm1
537; SSSE3-NEXT:    pshufb %xmm2, %xmm1
538; SSSE3-NEXT:    movdqa %xmm1, %xmm0
539; SSSE3-NEXT:    psrad $31, %xmm0
540; SSSE3-NEXT:    psrad $24, %xmm1
541; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
542; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
543; SSSE3-NEXT:    movdqa %xmm0, %xmm4
544; SSSE3-NEXT:    psrad $31, %xmm4
545; SSSE3-NEXT:    psrad $24, %xmm0
546; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
547; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
548; SSSE3-NEXT:    pshufb %xmm2, %xmm3
549; SSSE3-NEXT:    movdqa %xmm3, %xmm2
550; SSSE3-NEXT:    psrad $31, %xmm2
551; SSSE3-NEXT:    psrad $24, %xmm3
552; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
553; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
554; SSSE3-NEXT:    movdqa %xmm2, %xmm4
555; SSSE3-NEXT:    psrad $31, %xmm4
556; SSSE3-NEXT:    psrad $24, %xmm2
557; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
558; SSSE3-NEXT:    retq
559;
560; SSE41-LABEL: sext_16i8_to_8i64:
561; SSE41:       # %bb.0: # %entry
562; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
563; SSE41-NEXT:    movdqa %xmm0, %xmm1
564; SSE41-NEXT:    psrld $16, %xmm1
565; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
566; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
567; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
568; SSE41-NEXT:    psrlq $48, %xmm0
569; SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
570; SSE41-NEXT:    movdqa %xmm4, %xmm0
571; SSE41-NEXT:    retq
572;
573; AVX1-LABEL: sext_16i8_to_8i64:
574; AVX1:       # %bb.0: # %entry
575; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
576; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
577; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
578; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
579; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
580; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
581; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
582; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
583; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
584; AVX1-NEXT:    vmovaps %ymm2, %ymm0
585; AVX1-NEXT:    retq
586;
587; AVX2-LABEL: sext_16i8_to_8i64:
588; AVX2:       # %bb.0: # %entry
589; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm2
590; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
591; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm1
592; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
593; AVX2-NEXT:    retq
594;
595; AVX512-LABEL: sext_16i8_to_8i64:
596; AVX512:       # %bb.0: # %entry
597; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
598; AVX512-NEXT:    retq
599;
600; X32-SSE41-LABEL: sext_16i8_to_8i64:
601; X32-SSE41:       # %bb.0: # %entry
602; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
603; X32-SSE41-NEXT:    movdqa %xmm0, %xmm1
604; X32-SSE41-NEXT:    psrld $16, %xmm1
605; X32-SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
606; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
607; X32-SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
608; X32-SSE41-NEXT:    psrlq $48, %xmm0
609; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
610; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
611; X32-SSE41-NEXT:    retl
612entry:
613  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
614  %C = sext <8 x i8> %B to <8 x i64>
615  ret <8 x i64> %C
616}
617
; Sign-extends the low four i16 lanes of %A (shufflevector mask 0..3) to
; <4 x i32>.
; Per the assertions below, the SSE2 and SSSE3 runs use punpcklwd followed by
; psrad $16; the remaining runs use a single pmovsxwd / vpmovsxwd.
618define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
619; SSE2-LABEL: sext_8i16_to_4i32:
620; SSE2:       # %bb.0: # %entry
621; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
622; SSE2-NEXT:    psrad $16, %xmm0
623; SSE2-NEXT:    retq
624;
625; SSSE3-LABEL: sext_8i16_to_4i32:
626; SSSE3:       # %bb.0: # %entry
627; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
628; SSSE3-NEXT:    psrad $16, %xmm0
629; SSSE3-NEXT:    retq
630;
631; SSE41-LABEL: sext_8i16_to_4i32:
632; SSE41:       # %bb.0: # %entry
633; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
634; SSE41-NEXT:    retq
635;
636; AVX-LABEL: sext_8i16_to_4i32:
637; AVX:       # %bb.0: # %entry
638; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
639; AVX-NEXT:    retq
640;
641; X32-SSE41-LABEL: sext_8i16_to_4i32:
642; X32-SSE41:       # %bb.0: # %entry
643; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
644; X32-SSE41-NEXT:    retl
645entry:
646  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
647  %C = sext <4 x i16> %B to <4 x i32>
648  ret <4 x i32> %C
649}
650
; Sign-extends all eight i16 lanes of %A to <8 x i32>.
; Per the assertions below, the SSE2 and SSSE3 runs use punpcklwd/punpckhwd plus
; psrad $16, the SSE4.1 and AVX1 runs two pmovsxwd-style extensions, and the AVX2
; and AVX512 runs one vpmovsxwd from xmm0 to ymm0.
651define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
652; SSE2-LABEL: sext_8i16_to_8i32:
653; SSE2:       # %bb.0: # %entry
654; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
655; SSE2-NEXT:    psrad $16, %xmm2
656; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
657; SSE2-NEXT:    psrad $16, %xmm1
658; SSE2-NEXT:    movdqa %xmm2, %xmm0
659; SSE2-NEXT:    retq
660;
661; SSSE3-LABEL: sext_8i16_to_8i32:
662; SSSE3:       # %bb.0: # %entry
663; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
664; SSSE3-NEXT:    psrad $16, %xmm2
665; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
666; SSSE3-NEXT:    psrad $16, %xmm1
667; SSSE3-NEXT:    movdqa %xmm2, %xmm0
668; SSSE3-NEXT:    retq
669;
670; SSE41-LABEL: sext_8i16_to_8i32:
671; SSE41:       # %bb.0: # %entry
672; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
673; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
674; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
675; SSE41-NEXT:    movdqa %xmm2, %xmm0
676; SSE41-NEXT:    retq
677;
678; AVX1-LABEL: sext_8i16_to_8i32:
679; AVX1:       # %bb.0: # %entry
680; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
681; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
682; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
683; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
684; AVX1-NEXT:    retq
685;
686; AVX2-LABEL: sext_8i16_to_8i32:
687; AVX2:       # %bb.0: # %entry
688; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
689; AVX2-NEXT:    retq
690;
691; AVX512-LABEL: sext_8i16_to_8i32:
692; AVX512:       # %bb.0: # %entry
693; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
694; AVX512-NEXT:    retq
695;
696; X32-SSE41-LABEL: sext_8i16_to_8i32:
697; X32-SSE41:       # %bb.0: # %entry
698; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
699; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
700; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
701; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
702; X32-SSE41-NEXT:    retl
703entry:
704  %B = sext <8 x i16> %A to <8 x i32>
705  ret <8 x i32> %B
706}
707
; Sign-extends every lane of a <16 x i16> argument to i32 and returns the
; <16 x i32> result. The SSE runs expect the result spread over xmm0-xmm3,
; the AVX1/AVX2 runs over ymm0-ymm1, and the AVX512 runs expect a single
; ymm-to-zmm vpmovsxwd.
708define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
709; SSE2-LABEL: sext_16i16_to_16i32:
710; SSE2:       # %bb.0: # %entry
711; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
712; SSE2-NEXT:    psrad $16, %xmm4
713; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
714; SSE2-NEXT:    psrad $16, %xmm5
715; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
716; SSE2-NEXT:    psrad $16, %xmm2
717; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
718; SSE2-NEXT:    psrad $16, %xmm3
719; SSE2-NEXT:    movdqa %xmm4, %xmm0
720; SSE2-NEXT:    movdqa %xmm5, %xmm1
721; SSE2-NEXT:    retq
722;
723; SSSE3-LABEL: sext_16i16_to_16i32:
724; SSSE3:       # %bb.0: # %entry
725; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
726; SSSE3-NEXT:    psrad $16, %xmm4
727; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
728; SSSE3-NEXT:    psrad $16, %xmm5
729; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
730; SSSE3-NEXT:    psrad $16, %xmm2
731; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
732; SSSE3-NEXT:    psrad $16, %xmm3
733; SSSE3-NEXT:    movdqa %xmm4, %xmm0
734; SSSE3-NEXT:    movdqa %xmm5, %xmm1
735; SSSE3-NEXT:    retq
736;
737; SSE41-LABEL: sext_16i16_to_16i32:
738; SSE41:       # %bb.0: # %entry
739; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
740; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
741; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
742; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
743; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
744; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
745; SSE41-NEXT:    movdqa %xmm5, %xmm0
746; SSE41-NEXT:    movdqa %xmm4, %xmm1
747; SSE41-NEXT:    retq
748;
749; AVX1-LABEL: sext_16i16_to_16i32:
750; AVX1:       # %bb.0: # %entry
751; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
752; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
753; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
754; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
755; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
756; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
757; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
758; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
759; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
760; AVX1-NEXT:    vmovaps %ymm2, %ymm0
761; AVX1-NEXT:    retq
762;
763; AVX2-LABEL: sext_16i16_to_16i32:
764; AVX2:       # %bb.0: # %entry
765; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
766; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
767; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm1
768; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
769; AVX2-NEXT:    retq
770;
771; AVX512-LABEL: sext_16i16_to_16i32:
772; AVX512:       # %bb.0: # %entry
773; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
774; AVX512-NEXT:    retq
775;
776; X32-SSE41-LABEL: sext_16i16_to_16i32:
777; X32-SSE41:       # %bb.0: # %entry
778; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
779; X32-SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
780; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
781; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
782; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
783; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
784; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
785; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
786; X32-SSE41-NEXT:    retl
787entry:
788  %B = sext <16 x i16> %A to <16 x i32>
789  ret <16 x i32> %B
790}
791
; Sign-extends the two lowest i16 lanes of %A to i64. The SSE4.1 and AVX runs
; expect a single (v)pmovsxwq; the SSE2/SSSE3 runs expect an unpack, two
; arithmetic shifts and an interleave instead.
792define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
793; SSE2-LABEL: sext_8i16_to_2i64:
794; SSE2:       # %bb.0: # %entry
795; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
796; SSE2-NEXT:    movdqa %xmm0, %xmm1
797; SSE2-NEXT:    psrad $31, %xmm1
798; SSE2-NEXT:    psrad $16, %xmm0
799; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
800; SSE2-NEXT:    retq
801;
802; SSSE3-LABEL: sext_8i16_to_2i64:
803; SSSE3:       # %bb.0: # %entry
804; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
805; SSSE3-NEXT:    movdqa %xmm0, %xmm1
806; SSSE3-NEXT:    psrad $31, %xmm1
807; SSSE3-NEXT:    psrad $16, %xmm0
808; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
809; SSSE3-NEXT:    retq
810;
811; SSE41-LABEL: sext_8i16_to_2i64:
812; SSE41:       # %bb.0: # %entry
813; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
814; SSE41-NEXT:    retq
815;
816; AVX-LABEL: sext_8i16_to_2i64:
817; AVX:       # %bb.0: # %entry
818; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
819; AVX-NEXT:    retq
820;
821; X32-SSE41-LABEL: sext_8i16_to_2i64:
822; X32-SSE41:       # %bb.0: # %entry
823; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
824; X32-SSE41-NEXT:    retl
825entry:
; Keep only lanes 0 and 1 so the sext source is a <2 x i16>.
826  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
827  %C = sext <2 x i16> %B to <2 x i64>
828  ret <2 x i64> %C
829}
830
; Sign-extends the four lowest i16 lanes of %A to i64. The AVX2 and AVX512
; runs expect one xmm-to-ymm vpmovsxwq; the SSE4.1 and AVX1 runs expect two
; 128-bit (v)pmovsxwq with a pshufd between them; the SSE2/SSSE3 runs expect
; shift-and-interleave sequences.
831define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
832; SSE2-LABEL: sext_8i16_to_4i64:
833; SSE2:       # %bb.0: # %entry
834; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
835; SSE2-NEXT:    movdqa %xmm2, %xmm1
836; SSE2-NEXT:    psrad $31, %xmm1
837; SSE2-NEXT:    psrad $16, %xmm2
838; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
839; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
840; SSE2-NEXT:    movdqa %xmm1, %xmm0
841; SSE2-NEXT:    psrad $31, %xmm0
842; SSE2-NEXT:    psrad $16, %xmm1
843; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
844; SSE2-NEXT:    movdqa %xmm2, %xmm0
845; SSE2-NEXT:    retq
846;
847; SSSE3-LABEL: sext_8i16_to_4i64:
848; SSSE3:       # %bb.0: # %entry
849; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
850; SSSE3-NEXT:    movdqa %xmm2, %xmm1
851; SSSE3-NEXT:    psrad $31, %xmm1
852; SSSE3-NEXT:    psrad $16, %xmm2
853; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
854; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
855; SSSE3-NEXT:    movdqa %xmm1, %xmm0
856; SSSE3-NEXT:    psrad $31, %xmm0
857; SSSE3-NEXT:    psrad $16, %xmm1
858; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
859; SSSE3-NEXT:    movdqa %xmm2, %xmm0
860; SSSE3-NEXT:    retq
861;
862; SSE41-LABEL: sext_8i16_to_4i64:
863; SSE41:       # %bb.0: # %entry
864; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
865; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
866; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
867; SSE41-NEXT:    movdqa %xmm2, %xmm0
868; SSE41-NEXT:    retq
869;
870; AVX1-LABEL: sext_8i16_to_4i64:
871; AVX1:       # %bb.0: # %entry
872; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
873; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
874; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
875; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
876; AVX1-NEXT:    retq
877;
878; AVX2-LABEL: sext_8i16_to_4i64:
879; AVX2:       # %bb.0: # %entry
880; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
881; AVX2-NEXT:    retq
882;
883; AVX512-LABEL: sext_8i16_to_4i64:
884; AVX512:       # %bb.0: # %entry
885; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
886; AVX512-NEXT:    retq
887;
888; X32-SSE41-LABEL: sext_8i16_to_4i64:
889; X32-SSE41:       # %bb.0: # %entry
890; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
891; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
892; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
893; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
894; X32-SSE41-NEXT:    retl
895entry:
; Keep only lanes 0 through 3 so the sext source is a <4 x i16>.
896  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
897  %C = sext <4 x i16> %B to <4 x i64>
898  ret <4 x i64> %C
899}
900
; Sign-extends all eight i16 lanes of %A to i64 and returns the <8 x i64>
; result. The AVX512 runs expect one xmm-to-zmm vpmovsxwq, the AVX2 run two
; xmm-to-ymm vpmovsxwq, and the SSE4.1 and AVX1 runs four 128-bit
; (v)pmovsxwq, each fed by a pshufd that moves the next pair of lanes down.
901define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
902; SSE2-LABEL: sext_8i16_to_8i64:
903; SSE2:       # %bb.0: # %entry
904; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
905; SSE2-NEXT:    movdqa %xmm4, %xmm1
906; SSE2-NEXT:    psrad $31, %xmm1
907; SSE2-NEXT:    psrad $16, %xmm4
908; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
909; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
910; SSE2-NEXT:    movdqa %xmm2, %xmm1
911; SSE2-NEXT:    psrad $31, %xmm1
912; SSE2-NEXT:    psrad $16, %xmm2
913; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
914; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
915; SSE2-NEXT:    movdqa %xmm1, %xmm3
916; SSE2-NEXT:    psrad $31, %xmm3
917; SSE2-NEXT:    psrad $16, %xmm1
918; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
919; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
920; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
921; SSE2-NEXT:    movdqa %xmm3, %xmm0
922; SSE2-NEXT:    psrad $31, %xmm0
923; SSE2-NEXT:    psrad $16, %xmm3
924; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
925; SSE2-NEXT:    movdqa %xmm4, %xmm0
926; SSE2-NEXT:    retq
927;
928; SSSE3-LABEL: sext_8i16_to_8i64:
929; SSSE3:       # %bb.0: # %entry
930; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
931; SSSE3-NEXT:    movdqa %xmm4, %xmm1
932; SSSE3-NEXT:    psrad $31, %xmm1
933; SSSE3-NEXT:    psrad $16, %xmm4
934; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
935; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
936; SSSE3-NEXT:    movdqa %xmm2, %xmm1
937; SSSE3-NEXT:    psrad $31, %xmm1
938; SSSE3-NEXT:    psrad $16, %xmm2
939; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
940; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
941; SSSE3-NEXT:    movdqa %xmm1, %xmm3
942; SSSE3-NEXT:    psrad $31, %xmm3
943; SSSE3-NEXT:    psrad $16, %xmm1
944; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
945; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
946; SSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
947; SSSE3-NEXT:    movdqa %xmm3, %xmm0
948; SSSE3-NEXT:    psrad $31, %xmm0
949; SSSE3-NEXT:    psrad $16, %xmm3
950; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
951; SSSE3-NEXT:    movdqa %xmm4, %xmm0
952; SSSE3-NEXT:    retq
953;
954; SSE41-LABEL: sext_8i16_to_8i64:
955; SSE41:       # %bb.0: # %entry
956; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
957; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
958; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
959; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
960; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
961; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
962; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
963; SSE41-NEXT:    movdqa %xmm4, %xmm0
964; SSE41-NEXT:    retq
965;
966; AVX1-LABEL: sext_8i16_to_8i64:
967; AVX1:       # %bb.0: # %entry
968; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
969; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
970; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
971; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
972; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
973; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
974; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
975; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
976; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
977; AVX1-NEXT:    vmovaps %ymm2, %ymm0
978; AVX1-NEXT:    retq
979;
980; AVX2-LABEL: sext_8i16_to_8i64:
981; AVX2:       # %bb.0: # %entry
982; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm2
983; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
984; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm1
985; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
986; AVX2-NEXT:    retq
987;
988; AVX512-LABEL: sext_8i16_to_8i64:
989; AVX512:       # %bb.0: # %entry
990; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
991; AVX512-NEXT:    retq
992;
993; X32-SSE41-LABEL: sext_8i16_to_8i64:
994; X32-SSE41:       # %bb.0: # %entry
995; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
996; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
997; X32-SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
998; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
999; X32-SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1000; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1001; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1002; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
1003; X32-SSE41-NEXT:    retl
1004entry:
1005  %B = sext <8 x i16> %A to <8 x i64>
1006  ret <8 x i64> %B
1007}
1008
; Sign-extends the two lowest i32 lanes of %A to i64. The SSE4.1 and AVX runs
; expect a single (v)pmovsxdq; the SSE2/SSSE3 runs expect the sign words to be
; produced with psrad $31 and interleaved with the source via punpckldq.
1009define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1010; SSE2-LABEL: sext_4i32_to_2i64:
1011; SSE2:       # %bb.0: # %entry
1012; SSE2-NEXT:    movdqa %xmm0, %xmm1
1013; SSE2-NEXT:    psrad $31, %xmm1
1014; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1015; SSE2-NEXT:    retq
1016;
1017; SSSE3-LABEL: sext_4i32_to_2i64:
1018; SSSE3:       # %bb.0: # %entry
1019; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1020; SSSE3-NEXT:    psrad $31, %xmm1
1021; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1022; SSSE3-NEXT:    retq
1023;
1024; SSE41-LABEL: sext_4i32_to_2i64:
1025; SSE41:       # %bb.0: # %entry
1026; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1027; SSE41-NEXT:    retq
1028;
1029; AVX-LABEL: sext_4i32_to_2i64:
1030; AVX:       # %bb.0: # %entry
1031; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
1032; AVX-NEXT:    retq
1033;
1034; X32-SSE41-LABEL: sext_4i32_to_2i64:
1035; X32-SSE41:       # %bb.0: # %entry
1036; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1037; X32-SSE41-NEXT:    retl
1038entry:
; Keep only lanes 0 and 1 so the sext source is a <2 x i32>.
1039  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1040  %C = sext <2 x i32> %B to <2 x i64>
1041  ret <2 x i64> %C
1042}
1043
; Sign-extends all four i32 lanes of %A to i64 and returns the <4 x i64>
; result. The AVX2 and AVX512 runs expect one xmm-to-ymm vpmovsxdq; the
; SSE4.1 and AVX1 runs expect two 128-bit (v)pmovsxdq with a pshufd that
; swaps the 64-bit halves in between; the SSE2/SSSE3 runs expect psrad $31
; plus punpckldq for each half.
1044define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1045; SSE2-LABEL: sext_4i32_to_4i64:
1046; SSE2:       # %bb.0: # %entry
1047; SSE2-NEXT:    movdqa %xmm0, %xmm2
1048; SSE2-NEXT:    psrad $31, %xmm2
1049; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1050; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1051; SSE2-NEXT:    movdqa %xmm1, %xmm2
1052; SSE2-NEXT:    psrad $31, %xmm2
1053; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1054; SSE2-NEXT:    retq
1055;
1056; SSSE3-LABEL: sext_4i32_to_4i64:
1057; SSSE3:       # %bb.0: # %entry
1058; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1059; SSSE3-NEXT:    psrad $31, %xmm2
1060; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1061; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1062; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1063; SSSE3-NEXT:    psrad $31, %xmm2
1064; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1065; SSSE3-NEXT:    retq
1066;
1067; SSE41-LABEL: sext_4i32_to_4i64:
1068; SSE41:       # %bb.0: # %entry
1069; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1070; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1071; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1072; SSE41-NEXT:    movdqa %xmm2, %xmm0
1073; SSE41-NEXT:    retq
1074;
1075; AVX1-LABEL: sext_4i32_to_4i64:
1076; AVX1:       # %bb.0: # %entry
1077; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1078; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1079; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1080; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1081; AVX1-NEXT:    retq
1082;
1083; AVX2-LABEL: sext_4i32_to_4i64:
1084; AVX2:       # %bb.0: # %entry
1085; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
1086; AVX2-NEXT:    retq
1087;
1088; AVX512-LABEL: sext_4i32_to_4i64:
1089; AVX512:       # %bb.0: # %entry
1090; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
1091; AVX512-NEXT:    retq
1092;
1093; X32-SSE41-LABEL: sext_4i32_to_4i64:
1094; X32-SSE41:       # %bb.0: # %entry
1095; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1096; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1097; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1098; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
1099; X32-SSE41-NEXT:    retl
1100entry:
1101  %B = sext <4 x i32> %A to <4 x i64>
1102  ret <4 x i64> %B
1103}
1104
; Sign-extends every lane of a <8 x i32> argument to i64 and returns the
; <8 x i64> result. The AVX512 runs expect one ymm-to-zmm vpmovsxdq; the AVX2
; run expects two xmm-to-ymm vpmovsxdq split by vextracti128; the remaining
; runs expect four 128-bit pieces.
1105define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
1106; SSE2-LABEL: sext_8i32_to_8i64:
1107; SSE2:       # %bb.0: # %entry
1108; SSE2-NEXT:    movdqa %xmm1, %xmm2
1109; SSE2-NEXT:    movdqa %xmm0, %xmm3
1110; SSE2-NEXT:    psrad $31, %xmm3
1111; SSE2-NEXT:    movdqa %xmm1, %xmm4
1112; SSE2-NEXT:    psrad $31, %xmm4
1113; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1114; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1115; SSE2-NEXT:    movdqa %xmm1, %xmm3
1116; SSE2-NEXT:    psrad $31, %xmm3
1117; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1118; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
1119; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1120; SSE2-NEXT:    movdqa %xmm3, %xmm4
1121; SSE2-NEXT:    psrad $31, %xmm4
1122; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1123; SSE2-NEXT:    retq
1124;
1125; SSSE3-LABEL: sext_8i32_to_8i64:
1126; SSSE3:       # %bb.0: # %entry
1127; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1128; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1129; SSSE3-NEXT:    psrad $31, %xmm3
1130; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1131; SSSE3-NEXT:    psrad $31, %xmm4
1132; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1133; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1134; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1135; SSSE3-NEXT:    psrad $31, %xmm3
1136; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1137; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
1138; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1139; SSSE3-NEXT:    movdqa %xmm3, %xmm4
1140; SSSE3-NEXT:    psrad $31, %xmm4
1141; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1142; SSSE3-NEXT:    retq
1143;
1144; SSE41-LABEL: sext_8i32_to_8i64:
1145; SSE41:       # %bb.0: # %entry
1146; SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1147; SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1148; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1149; SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1150; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1151; SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1152; SSE41-NEXT:    movdqa %xmm5, %xmm0
1153; SSE41-NEXT:    movdqa %xmm4, %xmm1
1154; SSE41-NEXT:    retq
1155;
1156; AVX1-LABEL: sext_8i32_to_8i64:
1157; AVX1:       # %bb.0: # %entry
1158; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1159; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
1160; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
1161; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1162; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1163; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1164; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1165; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1166; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1167; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1168; AVX1-NEXT:    retq
1169;
1170; AVX2-LABEL: sext_8i32_to_8i64:
1171; AVX2:       # %bb.0: # %entry
1172; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
1173; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1174; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
1175; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1176; AVX2-NEXT:    retq
1177;
1178; AVX512-LABEL: sext_8i32_to_8i64:
1179; AVX512:       # %bb.0: # %entry
1180; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
1181; AVX512-NEXT:    retq
1182;
1183; X32-SSE41-LABEL: sext_8i32_to_8i64:
1184; X32-SSE41:       # %bb.0: # %entry
1185; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1186; X32-SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1187; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1188; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1189; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1190; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1191; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
1192; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
1193; X32-SSE41-NEXT:    retl
1194entry:
1195  %B = sext <8 x i32> %A to <8 x i64>
1196  ret <8 x i64> %B
1197}
1198
; Loads a <2 x i1> through %ptr and sign-extends each bit to an i64 lane.
; The runs without AVX512 expect a byte load followed by one shift-left /
; arithmetic-shift-right pair per bit; the AVX512 runs expect the loaded byte
; to be moved into mask register k1 and expanded by a zero-masked
; vpternlogq $255.
1199define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
1200; SSE-LABEL: load_sext_2i1_to_2i64:
1201; SSE:       # %bb.0: # %entry
1202; SSE-NEXT:    movzbl (%rdi), %eax
1203; SSE-NEXT:    movq %rax, %rcx
1204; SSE-NEXT:    shlq $62, %rcx
1205; SSE-NEXT:    sarq $63, %rcx
1206; SSE-NEXT:    movq %rcx, %xmm1
1207; SSE-NEXT:    shlq $63, %rax
1208; SSE-NEXT:    sarq $63, %rax
1209; SSE-NEXT:    movq %rax, %xmm0
1210; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1211; SSE-NEXT:    retq
1212;
1213; AVX1-LABEL: load_sext_2i1_to_2i64:
1214; AVX1:       # %bb.0: # %entry
1215; AVX1-NEXT:    movzbl (%rdi), %eax
1216; AVX1-NEXT:    movq %rax, %rcx
1217; AVX1-NEXT:    shlq $62, %rcx
1218; AVX1-NEXT:    sarq $63, %rcx
1219; AVX1-NEXT:    vmovq %rcx, %xmm0
1220; AVX1-NEXT:    shlq $63, %rax
1221; AVX1-NEXT:    sarq $63, %rax
1222; AVX1-NEXT:    vmovq %rax, %xmm1
1223; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1224; AVX1-NEXT:    retq
1225;
1226; AVX2-LABEL: load_sext_2i1_to_2i64:
1227; AVX2:       # %bb.0: # %entry
1228; AVX2-NEXT:    movzbl (%rdi), %eax
1229; AVX2-NEXT:    movq %rax, %rcx
1230; AVX2-NEXT:    shlq $62, %rcx
1231; AVX2-NEXT:    sarq $63, %rcx
1232; AVX2-NEXT:    vmovq %rcx, %xmm0
1233; AVX2-NEXT:    shlq $63, %rax
1234; AVX2-NEXT:    sarq $63, %rax
1235; AVX2-NEXT:    vmovq %rax, %xmm1
1236; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1237; AVX2-NEXT:    retq
1238;
1239; AVX512F-LABEL: load_sext_2i1_to_2i64:
1240; AVX512F:       # %bb.0: # %entry
1241; AVX512F-NEXT:    movzbl (%rdi), %eax
1242; AVX512F-NEXT:    kmovw %eax, %k1
1243; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1244; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1245; AVX512F-NEXT:    vzeroupper
1246; AVX512F-NEXT:    retq
1247;
1248; AVX512BW-LABEL: load_sext_2i1_to_2i64:
1249; AVX512BW:       # %bb.0: # %entry
1250; AVX512BW-NEXT:    movzbl (%rdi), %eax
1251; AVX512BW-NEXT:    kmovd %eax, %k1
1252; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1253; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1254; AVX512BW-NEXT:    vzeroupper
1255; AVX512BW-NEXT:    retq
1256;
1257; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
1258; X32-SSE41:       # %bb.0: # %entry
1259; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1260; X32-SSE41-NEXT:    movzbl (%eax), %eax
1261; X32-SSE41-NEXT:    movl %eax, %ecx
1262; X32-SSE41-NEXT:    shll $31, %ecx
1263; X32-SSE41-NEXT:    sarl $31, %ecx
1264; X32-SSE41-NEXT:    movd %ecx, %xmm0
1265; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1266; X32-SSE41-NEXT:    shll $30, %eax
1267; X32-SSE41-NEXT:    sarl $31, %eax
1268; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1269; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1270; X32-SSE41-NEXT:    retl
1271entry:
1272 %X = load <2 x i1>, <2 x i1>* %ptr
1273 %Y = sext <2 x i1> %X to <2 x i64>
1274 ret <2 x i64> %Y
1275}
1276
; Loads a <2 x i8> through %ptr and sign-extends both bytes to i64 lanes.
; The SSE4.1 and AVX runs expect the load to be folded into a single
; (v)pmovsxbq memory operand; the SSE2/SSSE3 runs expect a 16-bit scalar load
; followed by byte placement and arithmetic shifts.
1277define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
1278; SSE2-LABEL: load_sext_2i8_to_2i64:
1279; SSE2:       # %bb.0: # %entry
1280; SSE2-NEXT:    movzwl (%rdi), %eax
1281; SSE2-NEXT:    movd %eax, %xmm0
1282; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1283; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1284; SSE2-NEXT:    movdqa %xmm0, %xmm1
1285; SSE2-NEXT:    psrad $31, %xmm1
1286; SSE2-NEXT:    psrad $24, %xmm0
1287; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1288; SSE2-NEXT:    retq
1289;
1290; SSSE3-LABEL: load_sext_2i8_to_2i64:
1291; SSSE3:       # %bb.0: # %entry
1292; SSSE3-NEXT:    movzwl (%rdi), %eax
1293; SSSE3-NEXT:    movd %eax, %xmm0
1294; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u]
1295; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1296; SSSE3-NEXT:    psrad $31, %xmm1
1297; SSSE3-NEXT:    psrad $24, %xmm0
1298; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1299; SSSE3-NEXT:    retq
1300;
1301; SSE41-LABEL: load_sext_2i8_to_2i64:
1302; SSE41:       # %bb.0: # %entry
1303; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1304; SSE41-NEXT:    retq
1305;
1306; AVX-LABEL: load_sext_2i8_to_2i64:
1307; AVX:       # %bb.0: # %entry
1308; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
1309; AVX-NEXT:    retq
1310;
1311; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
1312; X32-SSE41:       # %bb.0: # %entry
1313; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1314; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1315; X32-SSE41-NEXT:    retl
1316entry:
1317 %X = load <2 x i8>, <2 x i8>* %ptr
1318 %Y = sext <2 x i8> %X to <2 x i64>
1319 ret <2 x i64> %Y
1320}
1321
; Loads a <4 x i1> through %ptr and sign-extends each bit to an i32 lane.
; The runs without AVX512 expect one shift-left / arithmetic-shift-right pair
; per bit, with lanes assembled by punpck* (SSE2, SSSE3) or pinsrd (SSE4.1,
; AVX1, AVX2 and the 32-bit run); the AVX512 runs expect the loaded byte to
; be moved into mask register k1 and expanded by a zero-masked
; vpternlogd $255.
1322define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
1323; SSE2-LABEL: load_sext_4i1_to_4i32:
1324; SSE2:       # %bb.0: # %entry
1325; SSE2-NEXT:    movzbl (%rdi), %eax
1326; SSE2-NEXT:    movq %rax, %rcx
1327; SSE2-NEXT:    shlq $60, %rcx
1328; SSE2-NEXT:    sarq $63, %rcx
1329; SSE2-NEXT:    movd %ecx, %xmm0
1330; SSE2-NEXT:    movq %rax, %rcx
1331; SSE2-NEXT:    shlq $61, %rcx
1332; SSE2-NEXT:    sarq $63, %rcx
1333; SSE2-NEXT:    movd %ecx, %xmm1
1334; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1335; SSE2-NEXT:    movq %rax, %rcx
1336; SSE2-NEXT:    shlq $62, %rcx
1337; SSE2-NEXT:    sarq $63, %rcx
1338; SSE2-NEXT:    movd %ecx, %xmm2
1339; SSE2-NEXT:    shlq $63, %rax
1340; SSE2-NEXT:    sarq $63, %rax
1341; SSE2-NEXT:    movd %eax, %xmm0
1342; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1343; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1344; SSE2-NEXT:    retq
1345;
1346; SSSE3-LABEL: load_sext_4i1_to_4i32:
1347; SSSE3:       # %bb.0: # %entry
1348; SSSE3-NEXT:    movzbl (%rdi), %eax
1349; SSSE3-NEXT:    movq %rax, %rcx
1350; SSSE3-NEXT:    shlq $60, %rcx
1351; SSSE3-NEXT:    sarq $63, %rcx
1352; SSSE3-NEXT:    movd %ecx, %xmm0
1353; SSSE3-NEXT:    movq %rax, %rcx
1354; SSSE3-NEXT:    shlq $61, %rcx
1355; SSSE3-NEXT:    sarq $63, %rcx
1356; SSSE3-NEXT:    movd %ecx, %xmm1
1357; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1358; SSSE3-NEXT:    movq %rax, %rcx
1359; SSSE3-NEXT:    shlq $62, %rcx
1360; SSSE3-NEXT:    sarq $63, %rcx
1361; SSSE3-NEXT:    movd %ecx, %xmm2
1362; SSSE3-NEXT:    shlq $63, %rax
1363; SSSE3-NEXT:    sarq $63, %rax
1364; SSSE3-NEXT:    movd %eax, %xmm0
1365; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1366; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1367; SSSE3-NEXT:    retq
1368;
1369; SSE41-LABEL: load_sext_4i1_to_4i32:
1370; SSE41:       # %bb.0: # %entry
1371; SSE41-NEXT:    movzbl (%rdi), %eax
1372; SSE41-NEXT:    movq %rax, %rcx
1373; SSE41-NEXT:    shlq $62, %rcx
1374; SSE41-NEXT:    sarq $63, %rcx
1375; SSE41-NEXT:    movq %rax, %rdx
1376; SSE41-NEXT:    shlq $63, %rdx
1377; SSE41-NEXT:    sarq $63, %rdx
1378; SSE41-NEXT:    movd %edx, %xmm0
1379; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1380; SSE41-NEXT:    movq %rax, %rcx
1381; SSE41-NEXT:    shlq $61, %rcx
1382; SSE41-NEXT:    sarq $63, %rcx
1383; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
1384; SSE41-NEXT:    shlq $60, %rax
1385; SSE41-NEXT:    sarq $63, %rax
1386; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1387; SSE41-NEXT:    retq
1388;
1389; AVX1-LABEL: load_sext_4i1_to_4i32:
1390; AVX1:       # %bb.0: # %entry
1391; AVX1-NEXT:    movzbl (%rdi), %eax
1392; AVX1-NEXT:    movq %rax, %rcx
1393; AVX1-NEXT:    shlq $62, %rcx
1394; AVX1-NEXT:    sarq $63, %rcx
1395; AVX1-NEXT:    movq %rax, %rdx
1396; AVX1-NEXT:    shlq $63, %rdx
1397; AVX1-NEXT:    sarq $63, %rdx
1398; AVX1-NEXT:    vmovd %edx, %xmm0
1399; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1400; AVX1-NEXT:    movq %rax, %rcx
1401; AVX1-NEXT:    shlq $61, %rcx
1402; AVX1-NEXT:    sarq $63, %rcx
1403; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1404; AVX1-NEXT:    shlq $60, %rax
1405; AVX1-NEXT:    sarq $63, %rax
1406; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1407; AVX1-NEXT:    retq
1408;
1409; AVX2-LABEL: load_sext_4i1_to_4i32:
1410; AVX2:       # %bb.0: # %entry
1411; AVX2-NEXT:    movzbl (%rdi), %eax
1412; AVX2-NEXT:    movq %rax, %rcx
1413; AVX2-NEXT:    shlq $62, %rcx
1414; AVX2-NEXT:    sarq $63, %rcx
1415; AVX2-NEXT:    movq %rax, %rdx
1416; AVX2-NEXT:    shlq $63, %rdx
1417; AVX2-NEXT:    sarq $63, %rdx
1418; AVX2-NEXT:    vmovd %edx, %xmm0
1419; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1420; AVX2-NEXT:    movq %rax, %rcx
1421; AVX2-NEXT:    shlq $61, %rcx
1422; AVX2-NEXT:    sarq $63, %rcx
1423; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1424; AVX2-NEXT:    shlq $60, %rax
1425; AVX2-NEXT:    sarq $63, %rax
1426; AVX2-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1427; AVX2-NEXT:    retq
1428;
1429; AVX512F-LABEL: load_sext_4i1_to_4i32:
1430; AVX512F:       # %bb.0: # %entry
1431; AVX512F-NEXT:    movzbl (%rdi), %eax
1432; AVX512F-NEXT:    kmovw %eax, %k1
1433; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1434; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1435; AVX512F-NEXT:    vzeroupper
1436; AVX512F-NEXT:    retq
1437;
1438; AVX512BW-LABEL: load_sext_4i1_to_4i32:
1439; AVX512BW:       # %bb.0: # %entry
1440; AVX512BW-NEXT:    movzbl (%rdi), %eax
1441; AVX512BW-NEXT:    kmovd %eax, %k1
1442; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1443; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1444; AVX512BW-NEXT:    vzeroupper
1445; AVX512BW-NEXT:    retq
1446;
1447; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
1448; X32-SSE41:       # %bb.0: # %entry
1449; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1450; X32-SSE41-NEXT:    movl (%eax), %eax
1451; X32-SSE41-NEXT:    movl %eax, %ecx
1452; X32-SSE41-NEXT:    shll $30, %ecx
1453; X32-SSE41-NEXT:    sarl $31, %ecx
1454; X32-SSE41-NEXT:    movl %eax, %edx
1455; X32-SSE41-NEXT:    shll $31, %edx
1456; X32-SSE41-NEXT:    sarl $31, %edx
1457; X32-SSE41-NEXT:    movd %edx, %xmm0
1458; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1459; X32-SSE41-NEXT:    movl %eax, %ecx
1460; X32-SSE41-NEXT:    shll $29, %ecx
1461; X32-SSE41-NEXT:    sarl $31, %ecx
1462; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
1463; X32-SSE41-NEXT:    shll $28, %eax
1464; X32-SSE41-NEXT:    sarl $31, %eax
1465; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1466; X32-SSE41-NEXT:    retl
1467entry:
1468 %X = load <4 x i1>, <4 x i1>* %ptr
1469 %Y = sext <4 x i1> %X to <4 x i32>
1470 ret <4 x i32> %Y
1471}
1472
; Loads a <4 x i8> through %ptr and sign-extends the four bytes to i32 lanes.
; The SSE4.1 and AVX runs expect the load to be folded into a single
; (v)pmovsxbd memory operand; the SSE2/SSSE3 runs expect a 32-bit movd load,
; two unpacks that move each byte to the top of its dword, and psrad $24.
1473define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
1474; SSE2-LABEL: load_sext_4i8_to_4i32:
1475; SSE2:       # %bb.0: # %entry
1476; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1477; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1478; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1479; SSE2-NEXT:    psrad $24, %xmm0
1480; SSE2-NEXT:    retq
1481;
1482; SSSE3-LABEL: load_sext_4i8_to_4i32:
1483; SSSE3:       # %bb.0: # %entry
1484; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1485; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1486; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1487; SSSE3-NEXT:    psrad $24, %xmm0
1488; SSSE3-NEXT:    retq
1489;
1490; SSE41-LABEL: load_sext_4i8_to_4i32:
1491; SSE41:       # %bb.0: # %entry
1492; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1493; SSE41-NEXT:    retq
1494;
1495; AVX-LABEL: load_sext_4i8_to_4i32:
1496; AVX:       # %bb.0: # %entry
1497; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
1498; AVX-NEXT:    retq
1499;
1500; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
1501; X32-SSE41:       # %bb.0: # %entry
1502; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1503; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1504; X32-SSE41-NEXT:    retl
1505entry:
1506 %X = load <4 x i8>, <4 x i8>* %ptr
1507 %Y = sext <4 x i8> %X to <4 x i32>
1508 ret <4 x i32> %Y
1509}
1510
; Test: load a <4 x i1> mask from %ptr and sign-extend every lane to i64.
; The SSE-level and AVX1/AVX2 runs expect the individual bits to be pulled out
; with scalar shifts and then reassembled into vector registers. The AVX512F
; and AVX512BW runs expect the loaded byte to be moved into a mask register
; (kmovw / kmovd) and expanded with a zero-masked vpternlogq.
1511define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
1512; SSE2-LABEL: load_sext_4i1_to_4i64:
1513; SSE2:       # %bb.0: # %entry
1514; SSE2-NEXT:    movl (%rdi), %eax
1515; SSE2-NEXT:    movl %eax, %ecx
1516; SSE2-NEXT:    shrl $3, %ecx
1517; SSE2-NEXT:    movd %ecx, %xmm0
1518; SSE2-NEXT:    movl %eax, %ecx
1519; SSE2-NEXT:    shrl $2, %ecx
1520; SSE2-NEXT:    movd %ecx, %xmm1
1521; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1522; SSE2-NEXT:    movd %eax, %xmm2
1523; SSE2-NEXT:    shrl %eax
1524; SSE2-NEXT:    movd %eax, %xmm0
1525; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1526; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1527; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1528; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
1529; SSE2-NEXT:    psllq $63, %xmm0
1530; SSE2-NEXT:    psrad $31, %xmm0
1531; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1532; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
1533; SSE2-NEXT:    psllq $63, %xmm1
1534; SSE2-NEXT:    psrad $31, %xmm1
1535; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1536; SSE2-NEXT:    retq
1537;
1538; SSSE3-LABEL: load_sext_4i1_to_4i64:
1539; SSSE3:       # %bb.0: # %entry
1540; SSSE3-NEXT:    movl (%rdi), %eax
1541; SSSE3-NEXT:    movl %eax, %ecx
1542; SSSE3-NEXT:    shrl $3, %ecx
1543; SSSE3-NEXT:    movd %ecx, %xmm0
1544; SSSE3-NEXT:    movl %eax, %ecx
1545; SSSE3-NEXT:    shrl $2, %ecx
1546; SSSE3-NEXT:    movd %ecx, %xmm1
1547; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1548; SSSE3-NEXT:    movd %eax, %xmm2
1549; SSSE3-NEXT:    shrl %eax
1550; SSSE3-NEXT:    movd %eax, %xmm0
1551; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1552; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1553; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm2
1554; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
1555; SSSE3-NEXT:    psllq $63, %xmm0
1556; SSSE3-NEXT:    psrad $31, %xmm0
1557; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1558; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
1559; SSSE3-NEXT:    psllq $63, %xmm1
1560; SSSE3-NEXT:    psrad $31, %xmm1
1561; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1562; SSSE3-NEXT:    retq
1563;
1564; SSE41-LABEL: load_sext_4i1_to_4i64:
1565; SSE41:       # %bb.0: # %entry
1566; SSE41-NEXT:    movl (%rdi), %eax
1567; SSE41-NEXT:    movl %eax, %ecx
1568; SSE41-NEXT:    shrl %ecx
1569; SSE41-NEXT:    movd %eax, %xmm1
1570; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
1571; SSE41-NEXT:    movl %eax, %ecx
1572; SSE41-NEXT:    shrl $2, %ecx
1573; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
1574; SSE41-NEXT:    shrl $3, %eax
1575; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
1576; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
1577; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1578; SSE41-NEXT:    psllq $63, %xmm0
1579; SSE41-NEXT:    psrad $31, %xmm0
1580; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1581; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1582; SSE41-NEXT:    psllq $63, %xmm1
1583; SSE41-NEXT:    psrad $31, %xmm1
1584; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1585; SSE41-NEXT:    retq
1586;
1587; AVX1-LABEL: load_sext_4i1_to_4i64:
1588; AVX1:       # %bb.0: # %entry
1589; AVX1-NEXT:    movzbl (%rdi), %eax
1590; AVX1-NEXT:    movq %rax, %rcx
1591; AVX1-NEXT:    shlq $62, %rcx
1592; AVX1-NEXT:    sarq $63, %rcx
1593; AVX1-NEXT:    movq %rax, %rdx
1594; AVX1-NEXT:    shlq $63, %rdx
1595; AVX1-NEXT:    sarq $63, %rdx
1596; AVX1-NEXT:    vmovd %edx, %xmm0
1597; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1598; AVX1-NEXT:    movq %rax, %rcx
1599; AVX1-NEXT:    shlq $61, %rcx
1600; AVX1-NEXT:    sarq $63, %rcx
1601; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1602; AVX1-NEXT:    shlq $60, %rax
1603; AVX1-NEXT:    sarq $63, %rax
1604; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1605; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1606; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1607; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1608; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1609; AVX1-NEXT:    retq
1610;
1611; AVX2-LABEL: load_sext_4i1_to_4i64:
1612; AVX2:       # %bb.0: # %entry
1613; AVX2-NEXT:    movzbl (%rdi), %eax
1614; AVX2-NEXT:    movq %rax, %rcx
1615; AVX2-NEXT:    shlq $60, %rcx
1616; AVX2-NEXT:    sarq $63, %rcx
1617; AVX2-NEXT:    vmovq %rcx, %xmm0
1618; AVX2-NEXT:    movq %rax, %rcx
1619; AVX2-NEXT:    shlq $61, %rcx
1620; AVX2-NEXT:    sarq $63, %rcx
1621; AVX2-NEXT:    vmovq %rcx, %xmm1
1622; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1623; AVX2-NEXT:    movq %rax, %rcx
1624; AVX2-NEXT:    shlq $62, %rcx
1625; AVX2-NEXT:    sarq $63, %rcx
1626; AVX2-NEXT:    vmovq %rcx, %xmm1
1627; AVX2-NEXT:    shlq $63, %rax
1628; AVX2-NEXT:    sarq $63, %rax
1629; AVX2-NEXT:    vmovq %rax, %xmm2
1630; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1631; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1632; AVX2-NEXT:    retq
1633;
1634; AVX512F-LABEL: load_sext_4i1_to_4i64:
1635; AVX512F:       # %bb.0: # %entry
1636; AVX512F-NEXT:    movzbl (%rdi), %eax
1637; AVX512F-NEXT:    kmovw %eax, %k1
1638; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1639; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1640; AVX512F-NEXT:    retq
1641;
1642; AVX512BW-LABEL: load_sext_4i1_to_4i64:
1643; AVX512BW:       # %bb.0: # %entry
1644; AVX512BW-NEXT:    movzbl (%rdi), %eax
1645; AVX512BW-NEXT:    kmovd %eax, %k1
1646; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1647; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1648; AVX512BW-NEXT:    retq
1649;
1650; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
1651; X32-SSE41:       # %bb.0: # %entry
1652; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1653; X32-SSE41-NEXT:    movzbl (%eax), %eax
1654; X32-SSE41-NEXT:    movl %eax, %ecx
1655; X32-SSE41-NEXT:    shrl %ecx
1656; X32-SSE41-NEXT:    movd %eax, %xmm1
1657; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
1658; X32-SSE41-NEXT:    movl %eax, %ecx
1659; X32-SSE41-NEXT:    shrl $2, %ecx
1660; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
1661; X32-SSE41-NEXT:    shrl $3, %eax
1662; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
1663; X32-SSE41-NEXT:    pand {{\.LCPI.*}}, %xmm1
1664; X32-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1665; X32-SSE41-NEXT:    psllq $63, %xmm0
1666; X32-SSE41-NEXT:    psrad $31, %xmm0
1667; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1668; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1669; X32-SSE41-NEXT:    psllq $63, %xmm1
1670; X32-SSE41-NEXT:    psrad $31, %xmm1
1671; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1672; X32-SSE41-NEXT:    retl
1673entry:
1674 %X = load <4 x i1>, <4 x i1>* %ptr
1675 %Y = sext <4 x i1> %X to <4 x i64>
1676 ret <4 x i64> %Y
1677}
1678
; Test: load a <4 x i8> vector from %ptr and sign-extend every lane to i64.
; On the SSE-level runs the expected result occupies xmm0 and xmm1 (two lanes
; each); on the AVX-level runs it occupies ymm0. The AVX2 and AVX512 runs
; expect a single vpmovsxbq with a memory source, whereas the AVX1 run expects
; the extension to be done in two steps and joined with vinsertf128.
1679define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
1680; SSE2-LABEL: load_sext_4i8_to_4i64:
1681; SSE2:       # %bb.0: # %entry
1682; SSE2-NEXT:    movsbq 1(%rdi), %rax
1683; SSE2-NEXT:    movq %rax, %xmm1
1684; SSE2-NEXT:    movsbq (%rdi), %rax
1685; SSE2-NEXT:    movq %rax, %xmm0
1686; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1687; SSE2-NEXT:    movsbq 3(%rdi), %rax
1688; SSE2-NEXT:    movq %rax, %xmm2
1689; SSE2-NEXT:    movsbq 2(%rdi), %rax
1690; SSE2-NEXT:    movq %rax, %xmm1
1691; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1692; SSE2-NEXT:    retq
1693;
1694; SSSE3-LABEL: load_sext_4i8_to_4i64:
1695; SSSE3:       # %bb.0: # %entry
1696; SSSE3-NEXT:    movsbq 1(%rdi), %rax
1697; SSSE3-NEXT:    movq %rax, %xmm1
1698; SSSE3-NEXT:    movsbq (%rdi), %rax
1699; SSSE3-NEXT:    movq %rax, %xmm0
1700; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1701; SSSE3-NEXT:    movsbq 3(%rdi), %rax
1702; SSSE3-NEXT:    movq %rax, %xmm2
1703; SSSE3-NEXT:    movsbq 2(%rdi), %rax
1704; SSSE3-NEXT:    movq %rax, %xmm1
1705; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1706; SSSE3-NEXT:    retq
1707;
1708; SSE41-LABEL: load_sext_4i8_to_4i64:
1709; SSE41:       # %bb.0: # %entry
1710; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1711; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
1712; SSE41-NEXT:    retq
1713;
1714; AVX1-LABEL: load_sext_4i8_to_4i64:
1715; AVX1:       # %bb.0: # %entry
1716; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
1717; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1718; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1719; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1720; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1721; AVX1-NEXT:    retq
1722;
1723; AVX2-LABEL: load_sext_4i8_to_4i64:
1724; AVX2:       # %bb.0: # %entry
1725; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
1726; AVX2-NEXT:    retq
1727;
1728; AVX512-LABEL: load_sext_4i8_to_4i64:
1729; AVX512:       # %bb.0: # %entry
1730; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
1731; AVX512-NEXT:    retq
1732;
1733; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
1734; X32-SSE41:       # %bb.0: # %entry
1735; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1736; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1737; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
1738; X32-SSE41-NEXT:    retl
1739entry:
1740 %X = load <4 x i8>, <4 x i8>* %ptr
1741 %Y = sext <4 x i8> %X to <4 x i64>
1742 ret <4 x i64> %Y
1743}
1744
; Test: sign-extend a loaded <4 x i8> to <4 x i64>, then keep only the upper
; two lanes (shufflevector indices 2 and 3) as the <2 x i64> return value.
; The SSE-level runs expect only the bytes at offsets 2 and 3 to be read; the
; AVX2 and AVX512 runs expect all four lanes to be extended with vpmovsxbq,
; followed by vextracti128 of the high half and a vzeroupper before returning.
; This function has no explicit entry label, so the block comment in the
; assertions is a bare "# %bb.0" without a trailing "# %entry".
1745define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
1746; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
1747; SSE2:       # %bb.0:
1748; SSE2-NEXT:    movsbq 3(%rdi), %rax
1749; SSE2-NEXT:    movq %rax, %xmm1
1750; SSE2-NEXT:    movsbq 2(%rdi), %rax
1751; SSE2-NEXT:    movq %rax, %xmm0
1752; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1753; SSE2-NEXT:    retq
1754;
1755; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
1756; SSSE3:       # %bb.0:
1757; SSSE3-NEXT:    movsbq 3(%rdi), %rax
1758; SSSE3-NEXT:    movq %rax, %xmm1
1759; SSSE3-NEXT:    movsbq 2(%rdi), %rax
1760; SSSE3-NEXT:    movq %rax, %xmm0
1761; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1762; SSSE3-NEXT:    retq
1763;
1764; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
1765; SSE41:       # %bb.0:
1766; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm0
1767; SSE41-NEXT:    retq
1768;
1769; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
1770; AVX1:       # %bb.0:
1771; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
1772; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1773; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1774; AVX1-NEXT:    retq
1775;
1776; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
1777; AVX2:       # %bb.0:
1778; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
1779; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1780; AVX2-NEXT:    vzeroupper
1781; AVX2-NEXT:    retq
1782;
1783; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
1784; AVX512:       # %bb.0:
1785; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
1786; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
1787; AVX512-NEXT:    vzeroupper
1788; AVX512-NEXT:    retq
1789;
1790; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
1791; X32-SSE41:       # %bb.0:
1792; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1793; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm0
1794; X32-SSE41-NEXT:    retl
1795 %ld = load <4 x i8>, <4 x i8>* %ptr
1796 %sext = sext <4 x i8> %ld to <4 x i64>
1797 %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
1798 ret <2 x i64> %extract
1799}
1800
; Test: load an <8 x i1> mask from %ptr and sign-extend every lane to i16.
; Without AVX-512 the expected code sign-extends the loaded byte into a scalar
; register, isolates each bit with a shift-left / arithmetic-shift-right pair
; (the top bit needs only a right shift by 7), and assembles the lanes one at
; a time. The AVX512F run expects kmovw, a zero-masked vpternlogd and vpmovdw;
; the AVX512BW run expects kmovd followed by vpmovm2w.
1801define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
1802; SSE2-LABEL: load_sext_8i1_to_8i16:
1803; SSE2:       # %bb.0: # %entry
1804; SSE2-NEXT:    movsbq (%rdi), %rax
1805; SSE2-NEXT:    movq %rax, %rcx
1806; SSE2-NEXT:    shrq $7, %rcx
1807; SSE2-NEXT:    movd %ecx, %xmm0
1808; SSE2-NEXT:    movq %rax, %rcx
1809; SSE2-NEXT:    shlq $57, %rcx
1810; SSE2-NEXT:    sarq $63, %rcx
1811; SSE2-NEXT:    movd %ecx, %xmm2
1812; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1813; SSE2-NEXT:    movq %rax, %rcx
1814; SSE2-NEXT:    shlq $58, %rcx
1815; SSE2-NEXT:    sarq $63, %rcx
1816; SSE2-NEXT:    movd %ecx, %xmm0
1817; SSE2-NEXT:    movq %rax, %rcx
1818; SSE2-NEXT:    shlq $59, %rcx
1819; SSE2-NEXT:    sarq $63, %rcx
1820; SSE2-NEXT:    movd %ecx, %xmm1
1821; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1822; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1823; SSE2-NEXT:    movq %rax, %rcx
1824; SSE2-NEXT:    shlq $60, %rcx
1825; SSE2-NEXT:    sarq $63, %rcx
1826; SSE2-NEXT:    movd %ecx, %xmm0
1827; SSE2-NEXT:    movq %rax, %rcx
1828; SSE2-NEXT:    shlq $61, %rcx
1829; SSE2-NEXT:    sarq $63, %rcx
1830; SSE2-NEXT:    movd %ecx, %xmm2
1831; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1832; SSE2-NEXT:    movq %rax, %rcx
1833; SSE2-NEXT:    shlq $62, %rcx
1834; SSE2-NEXT:    sarq $63, %rcx
1835; SSE2-NEXT:    movd %ecx, %xmm3
1836; SSE2-NEXT:    shlq $63, %rax
1837; SSE2-NEXT:    sarq $63, %rax
1838; SSE2-NEXT:    movd %eax, %xmm0
1839; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1840; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1841; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1842; SSE2-NEXT:    retq
1843;
1844; SSSE3-LABEL: load_sext_8i1_to_8i16:
1845; SSSE3:       # %bb.0: # %entry
1846; SSSE3-NEXT:    movsbq (%rdi), %rax
1847; SSSE3-NEXT:    movq %rax, %rcx
1848; SSSE3-NEXT:    shrq $7, %rcx
1849; SSSE3-NEXT:    movd %ecx, %xmm0
1850; SSSE3-NEXT:    movq %rax, %rcx
1851; SSSE3-NEXT:    shlq $57, %rcx
1852; SSSE3-NEXT:    sarq $63, %rcx
1853; SSSE3-NEXT:    movd %ecx, %xmm2
1854; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1855; SSSE3-NEXT:    movq %rax, %rcx
1856; SSSE3-NEXT:    shlq $58, %rcx
1857; SSSE3-NEXT:    sarq $63, %rcx
1858; SSSE3-NEXT:    movd %ecx, %xmm0
1859; SSSE3-NEXT:    movq %rax, %rcx
1860; SSSE3-NEXT:    shlq $59, %rcx
1861; SSSE3-NEXT:    sarq $63, %rcx
1862; SSSE3-NEXT:    movd %ecx, %xmm1
1863; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1864; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1865; SSSE3-NEXT:    movq %rax, %rcx
1866; SSSE3-NEXT:    shlq $60, %rcx
1867; SSSE3-NEXT:    sarq $63, %rcx
1868; SSSE3-NEXT:    movd %ecx, %xmm0
1869; SSSE3-NEXT:    movq %rax, %rcx
1870; SSSE3-NEXT:    shlq $61, %rcx
1871; SSSE3-NEXT:    sarq $63, %rcx
1872; SSSE3-NEXT:    movd %ecx, %xmm2
1873; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1874; SSSE3-NEXT:    movq %rax, %rcx
1875; SSSE3-NEXT:    shlq $62, %rcx
1876; SSSE3-NEXT:    sarq $63, %rcx
1877; SSSE3-NEXT:    movd %ecx, %xmm3
1878; SSSE3-NEXT:    shlq $63, %rax
1879; SSSE3-NEXT:    sarq $63, %rax
1880; SSSE3-NEXT:    movd %eax, %xmm0
1881; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1882; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1883; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1884; SSSE3-NEXT:    retq
1885;
1886; SSE41-LABEL: load_sext_8i1_to_8i16:
1887; SSE41:       # %bb.0: # %entry
1888; SSE41-NEXT:    movsbq (%rdi), %rax
1889; SSE41-NEXT:    movq %rax, %rcx
1890; SSE41-NEXT:    shlq $62, %rcx
1891; SSE41-NEXT:    sarq $63, %rcx
1892; SSE41-NEXT:    movq %rax, %rdx
1893; SSE41-NEXT:    shlq $63, %rdx
1894; SSE41-NEXT:    sarq $63, %rdx
1895; SSE41-NEXT:    movd %edx, %xmm0
1896; SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
1897; SSE41-NEXT:    movq %rax, %rcx
1898; SSE41-NEXT:    shlq $61, %rcx
1899; SSE41-NEXT:    sarq $63, %rcx
1900; SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
1901; SSE41-NEXT:    movq %rax, %rcx
1902; SSE41-NEXT:    shlq $60, %rcx
1903; SSE41-NEXT:    sarq $63, %rcx
1904; SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
1905; SSE41-NEXT:    movq %rax, %rcx
1906; SSE41-NEXT:    shlq $59, %rcx
1907; SSE41-NEXT:    sarq $63, %rcx
1908; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
1909; SSE41-NEXT:    movq %rax, %rcx
1910; SSE41-NEXT:    shlq $58, %rcx
1911; SSE41-NEXT:    sarq $63, %rcx
1912; SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
1913; SSE41-NEXT:    movq %rax, %rcx
1914; SSE41-NEXT:    shlq $57, %rcx
1915; SSE41-NEXT:    sarq $63, %rcx
1916; SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
1917; SSE41-NEXT:    shrq $7, %rax
1918; SSE41-NEXT:    pinsrw $7, %eax, %xmm0
1919; SSE41-NEXT:    retq
1920;
1921; AVX1-LABEL: load_sext_8i1_to_8i16:
1922; AVX1:       # %bb.0: # %entry
1923; AVX1-NEXT:    movsbq (%rdi), %rax
1924; AVX1-NEXT:    movq %rax, %rcx
1925; AVX1-NEXT:    shlq $62, %rcx
1926; AVX1-NEXT:    sarq $63, %rcx
1927; AVX1-NEXT:    movq %rax, %rdx
1928; AVX1-NEXT:    shlq $63, %rdx
1929; AVX1-NEXT:    sarq $63, %rdx
1930; AVX1-NEXT:    vmovd %edx, %xmm0
1931; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
1932; AVX1-NEXT:    movq %rax, %rcx
1933; AVX1-NEXT:    shlq $61, %rcx
1934; AVX1-NEXT:    sarq $63, %rcx
1935; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
1936; AVX1-NEXT:    movq %rax, %rcx
1937; AVX1-NEXT:    shlq $60, %rcx
1938; AVX1-NEXT:    sarq $63, %rcx
1939; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
1940; AVX1-NEXT:    movq %rax, %rcx
1941; AVX1-NEXT:    shlq $59, %rcx
1942; AVX1-NEXT:    sarq $63, %rcx
1943; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1944; AVX1-NEXT:    movq %rax, %rcx
1945; AVX1-NEXT:    shlq $58, %rcx
1946; AVX1-NEXT:    sarq $63, %rcx
1947; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
1948; AVX1-NEXT:    movq %rax, %rcx
1949; AVX1-NEXT:    shlq $57, %rcx
1950; AVX1-NEXT:    sarq $63, %rcx
1951; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
1952; AVX1-NEXT:    shrq $7, %rax
1953; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1954; AVX1-NEXT:    retq
1955;
1956; AVX2-LABEL: load_sext_8i1_to_8i16:
1957; AVX2:       # %bb.0: # %entry
1958; AVX2-NEXT:    movsbq (%rdi), %rax
1959; AVX2-NEXT:    movq %rax, %rcx
1960; AVX2-NEXT:    shlq $62, %rcx
1961; AVX2-NEXT:    sarq $63, %rcx
1962; AVX2-NEXT:    movq %rax, %rdx
1963; AVX2-NEXT:    shlq $63, %rdx
1964; AVX2-NEXT:    sarq $63, %rdx
1965; AVX2-NEXT:    vmovd %edx, %xmm0
1966; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
1967; AVX2-NEXT:    movq %rax, %rcx
1968; AVX2-NEXT:    shlq $61, %rcx
1969; AVX2-NEXT:    sarq $63, %rcx
1970; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
1971; AVX2-NEXT:    movq %rax, %rcx
1972; AVX2-NEXT:    shlq $60, %rcx
1973; AVX2-NEXT:    sarq $63, %rcx
1974; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
1975; AVX2-NEXT:    movq %rax, %rcx
1976; AVX2-NEXT:    shlq $59, %rcx
1977; AVX2-NEXT:    sarq $63, %rcx
1978; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1979; AVX2-NEXT:    movq %rax, %rcx
1980; AVX2-NEXT:    shlq $58, %rcx
1981; AVX2-NEXT:    sarq $63, %rcx
1982; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
1983; AVX2-NEXT:    movq %rax, %rcx
1984; AVX2-NEXT:    shlq $57, %rcx
1985; AVX2-NEXT:    sarq $63, %rcx
1986; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
1987; AVX2-NEXT:    shrq $7, %rax
1988; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1989; AVX2-NEXT:    retq
1990;
1991; AVX512F-LABEL: load_sext_8i1_to_8i16:
1992; AVX512F:       # %bb.0: # %entry
1993; AVX512F-NEXT:    movzbl (%rdi), %eax
1994; AVX512F-NEXT:    kmovw %eax, %k1
1995; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1996; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
1997; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1998; AVX512F-NEXT:    vzeroupper
1999; AVX512F-NEXT:    retq
2000;
2001; AVX512BW-LABEL: load_sext_8i1_to_8i16:
2002; AVX512BW:       # %bb.0: # %entry
2003; AVX512BW-NEXT:    movzbl (%rdi), %eax
2004; AVX512BW-NEXT:    kmovd %eax, %k0
2005; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
2006; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2007; AVX512BW-NEXT:    vzeroupper
2008; AVX512BW-NEXT:    retq
2009;
2010; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
2011; X32-SSE41:       # %bb.0: # %entry
2012; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2013; X32-SSE41-NEXT:    movsbl (%eax), %eax
2014; X32-SSE41-NEXT:    movl %eax, %ecx
2015; X32-SSE41-NEXT:    shll $30, %ecx
2016; X32-SSE41-NEXT:    sarl $31, %ecx
2017; X32-SSE41-NEXT:    movl %eax, %edx
2018; X32-SSE41-NEXT:    shll $31, %edx
2019; X32-SSE41-NEXT:    sarl $31, %edx
2020; X32-SSE41-NEXT:    movd %edx, %xmm0
2021; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
2022; X32-SSE41-NEXT:    movl %eax, %ecx
2023; X32-SSE41-NEXT:    shll $29, %ecx
2024; X32-SSE41-NEXT:    sarl $31, %ecx
2025; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
2026; X32-SSE41-NEXT:    movl %eax, %ecx
2027; X32-SSE41-NEXT:    shll $28, %ecx
2028; X32-SSE41-NEXT:    sarl $31, %ecx
2029; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
2030; X32-SSE41-NEXT:    movl %eax, %ecx
2031; X32-SSE41-NEXT:    shll $27, %ecx
2032; X32-SSE41-NEXT:    sarl $31, %ecx
2033; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
2034; X32-SSE41-NEXT:    movl %eax, %ecx
2035; X32-SSE41-NEXT:    shll $26, %ecx
2036; X32-SSE41-NEXT:    sarl $31, %ecx
2037; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
2038; X32-SSE41-NEXT:    movl %eax, %ecx
2039; X32-SSE41-NEXT:    shll $25, %ecx
2040; X32-SSE41-NEXT:    sarl $31, %ecx
2041; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
2042; X32-SSE41-NEXT:    shrl $7, %eax
2043; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm0
2044; X32-SSE41-NEXT:    retl
2045entry:
2046 %X = load <8 x i1>, <8 x i1>* %ptr
2047 %Y = sext <8 x i1> %X to <8 x i16>
2048 ret <8 x i16> %Y
2049}
2050
; Test: load an <8 x i8> vector from %ptr and sign-extend every lane to i16.
; The SSE2 and SSSE3 runs expect a 64-bit movq load, a punpcklbw that
; duplicates each byte, and an arithmetic right shift by 8. The SSE4.1, AVX
; and 32-bit runs expect a (v)pmovsxbw that reads its source from memory.
2051define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
2052; SSE2-LABEL: load_sext_8i8_to_8i16:
2053; SSE2:       # %bb.0: # %entry
2054; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2055; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2056; SSE2-NEXT:    psraw $8, %xmm0
2057; SSE2-NEXT:    retq
2058;
2059; SSSE3-LABEL: load_sext_8i8_to_8i16:
2060; SSSE3:       # %bb.0: # %entry
2061; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2062; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2063; SSSE3-NEXT:    psraw $8, %xmm0
2064; SSSE3-NEXT:    retq
2065;
2066; SSE41-LABEL: load_sext_8i8_to_8i16:
2067; SSE41:       # %bb.0: # %entry
2068; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2069; SSE41-NEXT:    retq
2070;
2071; AVX-LABEL: load_sext_8i8_to_8i16:
2072; AVX:       # %bb.0: # %entry
2073; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
2074; AVX-NEXT:    retq
2075;
2076; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
2077; X32-SSE41:       # %bb.0: # %entry
2078; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2079; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2080; X32-SSE41-NEXT:    retl
2081entry:
2082 %X = load <8 x i8>, <8 x i8>* %ptr
2083 %Y = sext <8 x i8> %X to <8 x i16>
2084 ret <8 x i16> %Y
2085}
2086
; Test: load an <8 x i8> vector from %ptr and sign-extend every lane to i64.
; The expected result is spread over xmm0-xmm3 on the SSE-level runs, over
; ymm0 and ymm1 on the AVX1 and AVX2 runs, and fits in zmm0 on the AVX512
; runs, where a single vpmovsxbq with a memory source is expected.
2087define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
2088; SSE2-LABEL: load_sext_8i8_to_8i64:
2089; SSE2:       # %bb.0: # %entry
2090; SSE2-NEXT:    movsbq 1(%rdi), %rax
2091; SSE2-NEXT:    movq %rax, %xmm1
2092; SSE2-NEXT:    movsbq (%rdi), %rax
2093; SSE2-NEXT:    movq %rax, %xmm0
2094; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2095; SSE2-NEXT:    movsbq 3(%rdi), %rax
2096; SSE2-NEXT:    movq %rax, %xmm2
2097; SSE2-NEXT:    movsbq 2(%rdi), %rax
2098; SSE2-NEXT:    movq %rax, %xmm1
2099; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2100; SSE2-NEXT:    movsbq 5(%rdi), %rax
2101; SSE2-NEXT:    movq %rax, %xmm3
2102; SSE2-NEXT:    movsbq 4(%rdi), %rax
2103; SSE2-NEXT:    movq %rax, %xmm2
2104; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2105; SSE2-NEXT:    movsbq 7(%rdi), %rax
2106; SSE2-NEXT:    movq %rax, %xmm4
2107; SSE2-NEXT:    movsbq 6(%rdi), %rax
2108; SSE2-NEXT:    movq %rax, %xmm3
2109; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2110; SSE2-NEXT:    retq
2111;
2112; SSSE3-LABEL: load_sext_8i8_to_8i64:
2113; SSSE3:       # %bb.0: # %entry
2114; SSSE3-NEXT:    movsbq 1(%rdi), %rax
2115; SSSE3-NEXT:    movq %rax, %xmm1
2116; SSSE3-NEXT:    movsbq (%rdi), %rax
2117; SSSE3-NEXT:    movq %rax, %xmm0
2118; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2119; SSSE3-NEXT:    movsbq 3(%rdi), %rax
2120; SSSE3-NEXT:    movq %rax, %xmm2
2121; SSSE3-NEXT:    movsbq 2(%rdi), %rax
2122; SSSE3-NEXT:    movq %rax, %xmm1
2123; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2124; SSSE3-NEXT:    movsbq 5(%rdi), %rax
2125; SSSE3-NEXT:    movq %rax, %xmm3
2126; SSSE3-NEXT:    movsbq 4(%rdi), %rax
2127; SSSE3-NEXT:    movq %rax, %xmm2
2128; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2129; SSSE3-NEXT:    movsbq 7(%rdi), %rax
2130; SSSE3-NEXT:    movq %rax, %xmm4
2131; SSSE3-NEXT:    movsbq 6(%rdi), %rax
2132; SSSE3-NEXT:    movq %rax, %xmm3
2133; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2134; SSSE3-NEXT:    retq
2135;
2136; SSE41-LABEL: load_sext_8i8_to_8i64:
2137; SSE41:       # %bb.0: # %entry
2138; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
2139; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
2140; SSE41-NEXT:    pmovsxbq 4(%rdi), %xmm2
2141; SSE41-NEXT:    pmovsxbq 6(%rdi), %xmm3
2142; SSE41-NEXT:    retq
2143;
2144; AVX1-LABEL: load_sext_8i8_to_8i64:
2145; AVX1:       # %bb.0: # %entry
2146; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
2147; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
2148; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2149; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
2150; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2151; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm1
2152; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
2153; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2154; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
2155; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
2156; AVX1-NEXT:    retq
2157;
2158; AVX2-LABEL: load_sext_8i8_to_8i64:
2159; AVX2:       # %bb.0: # %entry
2160; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2161; AVX2-NEXT:    vpmovsxbq 4(%rdi), %ymm1
2162; AVX2-NEXT:    retq
2163;
2164; AVX512-LABEL: load_sext_8i8_to_8i64:
2165; AVX512:       # %bb.0: # %entry
2166; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
2167; AVX512-NEXT:    retq
2168;
2169; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
2170; X32-SSE41:       # %bb.0: # %entry
2171; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2172; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2173; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2174; X32-SSE41-NEXT:    pmovsxbq 4(%eax), %xmm2
2175; X32-SSE41-NEXT:    pmovsxbq 6(%eax), %xmm3
2176; X32-SSE41-NEXT:    retl
2177entry:
2178 %X = load <8 x i8>, <8 x i8>* %ptr
2179 %Y = sext <8 x i8> %X to <8 x i64>
2180 ret <8 x i64> %Y
2181}
2182
2183define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
2184; SSE2-LABEL: load_sext_8i1_to_8i32:
2185; SSE2:       # %bb.0: # %entry
2186; SSE2-NEXT:    movzbl (%rdi), %eax
2187; SSE2-NEXT:    movl %eax, %ecx
2188; SSE2-NEXT:    shrl $7, %ecx
2189; SSE2-NEXT:    movd %ecx, %xmm0
2190; SSE2-NEXT:    movl %eax, %ecx
2191; SSE2-NEXT:    shrl $6, %ecx
2192; SSE2-NEXT:    andl $1, %ecx
2193; SSE2-NEXT:    movd %ecx, %xmm1
2194; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2195; SSE2-NEXT:    movl %eax, %ecx
2196; SSE2-NEXT:    shrl $5, %ecx
2197; SSE2-NEXT:    andl $1, %ecx
2198; SSE2-NEXT:    movd %ecx, %xmm0
2199; SSE2-NEXT:    movl %eax, %ecx
2200; SSE2-NEXT:    shrl $4, %ecx
2201; SSE2-NEXT:    andl $1, %ecx
2202; SSE2-NEXT:    movd %ecx, %xmm2
2203; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2204; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2205; SSE2-NEXT:    movl %eax, %ecx
2206; SSE2-NEXT:    shrl $3, %ecx
2207; SSE2-NEXT:    andl $1, %ecx
2208; SSE2-NEXT:    movd %ecx, %xmm0
2209; SSE2-NEXT:    movl %eax, %ecx
2210; SSE2-NEXT:    shrl $2, %ecx
2211; SSE2-NEXT:    andl $1, %ecx
2212; SSE2-NEXT:    movd %ecx, %xmm3
2213; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2214; SSE2-NEXT:    movl %eax, %ecx
2215; SSE2-NEXT:    andl $1, %ecx
2216; SSE2-NEXT:    movd %ecx, %xmm1
2217; SSE2-NEXT:    shrl %eax
2218; SSE2-NEXT:    andl $1, %eax
2219; SSE2-NEXT:    movd %eax, %xmm0
2220; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2221; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2222; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2223; SSE2-NEXT:    movdqa %xmm1, %xmm0
2224; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2225; SSE2-NEXT:    pslld $31, %xmm0
2226; SSE2-NEXT:    psrad $31, %xmm0
2227; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2228; SSE2-NEXT:    pslld $31, %xmm1
2229; SSE2-NEXT:    psrad $31, %xmm1
2230; SSE2-NEXT:    retq
2231;
2232; SSSE3-LABEL: load_sext_8i1_to_8i32:
2233; SSSE3:       # %bb.0: # %entry
2234; SSSE3-NEXT:    movzbl (%rdi), %eax
2235; SSSE3-NEXT:    movl %eax, %ecx
2236; SSSE3-NEXT:    shrl $7, %ecx
2237; SSSE3-NEXT:    movd %ecx, %xmm0
2238; SSSE3-NEXT:    movl %eax, %ecx
2239; SSSE3-NEXT:    shrl $6, %ecx
2240; SSSE3-NEXT:    andl $1, %ecx
2241; SSSE3-NEXT:    movd %ecx, %xmm1
2242; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2243; SSSE3-NEXT:    movl %eax, %ecx
2244; SSSE3-NEXT:    shrl $5, %ecx
2245; SSSE3-NEXT:    andl $1, %ecx
2246; SSSE3-NEXT:    movd %ecx, %xmm0
2247; SSSE3-NEXT:    movl %eax, %ecx
2248; SSSE3-NEXT:    shrl $4, %ecx
2249; SSSE3-NEXT:    andl $1, %ecx
2250; SSSE3-NEXT:    movd %ecx, %xmm2
2251; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2252; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2253; SSSE3-NEXT:    movl %eax, %ecx
2254; SSSE3-NEXT:    shrl $3, %ecx
2255; SSSE3-NEXT:    andl $1, %ecx
2256; SSSE3-NEXT:    movd %ecx, %xmm0
2257; SSSE3-NEXT:    movl %eax, %ecx
2258; SSSE3-NEXT:    shrl $2, %ecx
2259; SSSE3-NEXT:    andl $1, %ecx
2260; SSSE3-NEXT:    movd %ecx, %xmm3
2261; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2262; SSSE3-NEXT:    movl %eax, %ecx
2263; SSSE3-NEXT:    andl $1, %ecx
2264; SSSE3-NEXT:    movd %ecx, %xmm1
2265; SSSE3-NEXT:    shrl %eax
2266; SSSE3-NEXT:    andl $1, %eax
2267; SSSE3-NEXT:    movd %eax, %xmm0
2268; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2269; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2270; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2271; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2272; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2273; SSSE3-NEXT:    pslld $31, %xmm0
2274; SSSE3-NEXT:    psrad $31, %xmm0
2275; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2276; SSSE3-NEXT:    pslld $31, %xmm1
2277; SSSE3-NEXT:    psrad $31, %xmm1
2278; SSSE3-NEXT:    retq
2279;
2280; SSE41-LABEL: load_sext_8i1_to_8i32:
2281; SSE41:       # %bb.0: # %entry
2282; SSE41-NEXT:    movzbl (%rdi), %eax
2283; SSE41-NEXT:    movl %eax, %ecx
2284; SSE41-NEXT:    shrl %ecx
2285; SSE41-NEXT:    andl $1, %ecx
2286; SSE41-NEXT:    movl %eax, %edx
2287; SSE41-NEXT:    andl $1, %edx
2288; SSE41-NEXT:    movd %edx, %xmm1
2289; SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
2290; SSE41-NEXT:    movl %eax, %ecx
2291; SSE41-NEXT:    shrl $2, %ecx
2292; SSE41-NEXT:    andl $1, %ecx
2293; SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
2294; SSE41-NEXT:    movl %eax, %ecx
2295; SSE41-NEXT:    shrl $3, %ecx
2296; SSE41-NEXT:    andl $1, %ecx
2297; SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
2298; SSE41-NEXT:    movl %eax, %ecx
2299; SSE41-NEXT:    shrl $4, %ecx
2300; SSE41-NEXT:    andl $1, %ecx
2301; SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
2302; SSE41-NEXT:    movl %eax, %ecx
2303; SSE41-NEXT:    shrl $5, %ecx
2304; SSE41-NEXT:    andl $1, %ecx
2305; SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
2306; SSE41-NEXT:    movl %eax, %ecx
2307; SSE41-NEXT:    shrl $6, %ecx
2308; SSE41-NEXT:    andl $1, %ecx
2309; SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
2310; SSE41-NEXT:    shrl $7, %eax
2311; SSE41-NEXT:    pinsrw $7, %eax, %xmm1
2312; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2313; SSE41-NEXT:    pslld $31, %xmm0
2314; SSE41-NEXT:    psrad $31, %xmm0
2315; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2316; SSE41-NEXT:    pslld $31, %xmm1
2317; SSE41-NEXT:    psrad $31, %xmm1
2318; SSE41-NEXT:    retq
2319;
2320; AVX1-LABEL: load_sext_8i1_to_8i32:
2321; AVX1:       # %bb.0: # %entry
2322; AVX1-NEXT:    movsbq (%rdi), %rax
2323; AVX1-NEXT:    movq %rax, %rcx
2324; AVX1-NEXT:    shlq $58, %rcx
2325; AVX1-NEXT:    sarq $63, %rcx
2326; AVX1-NEXT:    movq %rax, %rdx
2327; AVX1-NEXT:    shlq $59, %rdx
2328; AVX1-NEXT:    sarq $63, %rdx
2329; AVX1-NEXT:    vmovd %edx, %xmm0
2330; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
2331; AVX1-NEXT:    movq %rax, %rcx
2332; AVX1-NEXT:    shlq $57, %rcx
2333; AVX1-NEXT:    sarq $63, %rcx
2334; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
2335; AVX1-NEXT:    movq %rax, %rcx
2336; AVX1-NEXT:    shrq $7, %rcx
2337; AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
2338; AVX1-NEXT:    movq %rax, %rcx
2339; AVX1-NEXT:    shlq $62, %rcx
2340; AVX1-NEXT:    sarq $63, %rcx
2341; AVX1-NEXT:    movq %rax, %rdx
2342; AVX1-NEXT:    shlq $63, %rdx
2343; AVX1-NEXT:    sarq $63, %rdx
2344; AVX1-NEXT:    vmovd %edx, %xmm1
2345; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
2346; AVX1-NEXT:    movq %rax, %rcx
2347; AVX1-NEXT:    shlq $61, %rcx
2348; AVX1-NEXT:    sarq $63, %rcx
2349; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
2350; AVX1-NEXT:    shlq $60, %rax
2351; AVX1-NEXT:    sarq $63, %rax
2352; AVX1-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
2353; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2354; AVX1-NEXT:    retq
2355;
2356; AVX2-LABEL: load_sext_8i1_to_8i32:
2357; AVX2:       # %bb.0: # %entry
2358; AVX2-NEXT:    movsbq (%rdi), %rax
2359; AVX2-NEXT:    movq %rax, %rcx
2360; AVX2-NEXT:    shlq $58, %rcx
2361; AVX2-NEXT:    sarq $63, %rcx
2362; AVX2-NEXT:    movq %rax, %rdx
2363; AVX2-NEXT:    shlq $59, %rdx
2364; AVX2-NEXT:    sarq $63, %rdx
2365; AVX2-NEXT:    vmovd %edx, %xmm0
2366; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
2367; AVX2-NEXT:    movq %rax, %rcx
2368; AVX2-NEXT:    shlq $57, %rcx
2369; AVX2-NEXT:    sarq $63, %rcx
2370; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
2371; AVX2-NEXT:    movq %rax, %rcx
2372; AVX2-NEXT:    shrq $7, %rcx
2373; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
2374; AVX2-NEXT:    movq %rax, %rcx
2375; AVX2-NEXT:    shlq $62, %rcx
2376; AVX2-NEXT:    sarq $63, %rcx
2377; AVX2-NEXT:    movq %rax, %rdx
2378; AVX2-NEXT:    shlq $63, %rdx
2379; AVX2-NEXT:    sarq $63, %rdx
2380; AVX2-NEXT:    vmovd %edx, %xmm1
2381; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
2382; AVX2-NEXT:    movq %rax, %rcx
2383; AVX2-NEXT:    shlq $61, %rcx
2384; AVX2-NEXT:    sarq $63, %rcx
2385; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
2386; AVX2-NEXT:    shlq $60, %rax
2387; AVX2-NEXT:    sarq $63, %rax
2388; AVX2-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
2389; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
2390; AVX2-NEXT:    retq
2391;
2392; AVX512F-LABEL: load_sext_8i1_to_8i32:
2393; AVX512F:       # %bb.0: # %entry
2394; AVX512F-NEXT:    movzbl (%rdi), %eax
2395; AVX512F-NEXT:    kmovw %eax, %k1
2396; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2397; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2398; AVX512F-NEXT:    retq
2399;
2400; AVX512BW-LABEL: load_sext_8i1_to_8i32:
2401; AVX512BW:       # %bb.0: # %entry
2402; AVX512BW-NEXT:    movzbl (%rdi), %eax
2403; AVX512BW-NEXT:    kmovd %eax, %k1
2404; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2405; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2406; AVX512BW-NEXT:    retq
2407;
2408; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
2409; X32-SSE41:       # %bb.0: # %entry
2410; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2411; X32-SSE41-NEXT:    movzbl (%eax), %eax
2412; X32-SSE41-NEXT:    movl %eax, %ecx
2413; X32-SSE41-NEXT:    shrl %ecx
2414; X32-SSE41-NEXT:    andl $1, %ecx
2415; X32-SSE41-NEXT:    movl %eax, %edx
2416; X32-SSE41-NEXT:    andl $1, %edx
2417; X32-SSE41-NEXT:    movd %edx, %xmm1
2418; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
2419; X32-SSE41-NEXT:    movl %eax, %ecx
2420; X32-SSE41-NEXT:    shrl $2, %ecx
2421; X32-SSE41-NEXT:    andl $1, %ecx
2422; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
2423; X32-SSE41-NEXT:    movl %eax, %ecx
2424; X32-SSE41-NEXT:    shrl $3, %ecx
2425; X32-SSE41-NEXT:    andl $1, %ecx
2426; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
2427; X32-SSE41-NEXT:    movl %eax, %ecx
2428; X32-SSE41-NEXT:    shrl $4, %ecx
2429; X32-SSE41-NEXT:    andl $1, %ecx
2430; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
2431; X32-SSE41-NEXT:    movl %eax, %ecx
2432; X32-SSE41-NEXT:    shrl $5, %ecx
2433; X32-SSE41-NEXT:    andl $1, %ecx
2434; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
2435; X32-SSE41-NEXT:    movl %eax, %ecx
2436; X32-SSE41-NEXT:    shrl $6, %ecx
2437; X32-SSE41-NEXT:    andl $1, %ecx
2438; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
2439; X32-SSE41-NEXT:    shrl $7, %eax
2440; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm1
2441; X32-SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2442; X32-SSE41-NEXT:    pslld $31, %xmm0
2443; X32-SSE41-NEXT:    psrad $31, %xmm0
2444; X32-SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2445; X32-SSE41-NEXT:    pslld $31, %xmm1
2446; X32-SSE41-NEXT:    psrad $31, %xmm1
2447; X32-SSE41-NEXT:    retl
2448entry:
2449 %X = load <8 x i1>, <8 x i1>* %ptr
2450 %Y = sext <8 x i1> %X to <8 x i32>
2451 ret <8 x i32> %Y
2452}
2453
; Test: load an <8 x i8> vector through %ptr and sign-extend every lane to
; i32, returning the <8 x i32> result.
;
; The per-prefix blocks below are the expected llc output for each RUN line at
; the top of the file (X32-SSE41 is the single i686 run). They are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
;
; Visible expectations: the SSE2/SSSE3 output widens with unpack + psrad $24,
; SSE4.1 uses two pmovsxbd loads (offsets 0 and 4), AVX1 extends in two halves
; and joins them with vinsertf128, and AVX2/AVX512 use a single 256-bit
; vpmovsxbd load.
2454define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
2455; SSE2-LABEL: load_sext_8i8_to_8i32:
2456; SSE2:       # %bb.0: # %entry
2457; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2458; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2459; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2460; SSE2-NEXT:    psrad $24, %xmm0
2461; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2462; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2463; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2464; SSE2-NEXT:    psrad $24, %xmm1
2465; SSE2-NEXT:    retq
2466;
2467; SSSE3-LABEL: load_sext_8i8_to_8i32:
2468; SSSE3:       # %bb.0: # %entry
2469; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2470; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2471; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2472; SSSE3-NEXT:    psrad $24, %xmm0
2473; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2474; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2475; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2476; SSSE3-NEXT:    psrad $24, %xmm1
2477; SSSE3-NEXT:    retq
2478;
2479; SSE41-LABEL: load_sext_8i8_to_8i32:
2480; SSE41:       # %bb.0: # %entry
2481; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
2482; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
2483; SSE41-NEXT:    retq
2484;
2485; AVX1-LABEL: load_sext_8i8_to_8i32:
2486; AVX1:       # %bb.0: # %entry
2487; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
2488; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
2489; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2490; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
2491; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2492; AVX1-NEXT:    retq
2493;
2494; AVX2-LABEL: load_sext_8i8_to_8i32:
2495; AVX2:       # %bb.0: # %entry
2496; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
2497; AVX2-NEXT:    retq
2498;
2499; AVX512-LABEL: load_sext_8i8_to_8i32:
2500; AVX512:       # %bb.0: # %entry
2501; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
2502; AVX512-NEXT:    retq
2503;
2504; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
2505; X32-SSE41:       # %bb.0: # %entry
2506; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2507; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
2508; X32-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
2509; X32-SSE41-NEXT:    retl
; IR under test: one vector load followed by one vector sext.
2510entry:
2511 %X = load <8 x i8>, <8 x i8>* %ptr
2512 %Y = sext <8 x i8> %X to <8 x i32>
2513 ret <8 x i32> %Y
2514}
2515
; Test: load a <16 x i1> vector through %ptr and sign-extend every lane to i8,
; returning the <16 x i8> result.
;
; The per-prefix blocks below are the expected llc output for each RUN line at
; the top of the file (X32-SSE41 is the single i686 run). They are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
;
; Visible expectations: without AVX-512 the 16 bits are loaded as one scalar
; (movswq / movswl) and each bit is isolated with a shift pair before being
; packed into xmm0 (movd + punpck* on SSE2/SSSE3, pinsrb on SSE4.1/AVX).
; AVX512F loads the bits into a mask register and expands them with a
; zero-masked vpternlogd + vpmovdb; AVX512BW uses kmovw + vpmovm2b.
;
; NOTE(review): the function is declared `readnone` although its body loads
; through %ptr -- confirm the attribute is intentional before relying on it.
2516define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
2517; SSE2-LABEL: load_sext_16i1_to_16i8:
2518; SSE2:       # %bb.0: # %entry
2519; SSE2-NEXT:    pushq %rbp
2520; SSE2-NEXT:    pushq %r15
2521; SSE2-NEXT:    pushq %r14
2522; SSE2-NEXT:    pushq %r13
2523; SSE2-NEXT:    pushq %r12
2524; SSE2-NEXT:    pushq %rbx
2525; SSE2-NEXT:    movswq (%rdi), %rax
2526; SSE2-NEXT:    movq %rax, %r8
2527; SSE2-NEXT:    movq %rax, %r9
2528; SSE2-NEXT:    movq %rax, %r10
2529; SSE2-NEXT:    movq %rax, %r11
2530; SSE2-NEXT:    movq %rax, %r14
2531; SSE2-NEXT:    movq %rax, %r15
2532; SSE2-NEXT:    movq %rax, %r12
2533; SSE2-NEXT:    movq %rax, %r13
2534; SSE2-NEXT:    movq %rax, %rbx
2535; SSE2-NEXT:    movq %rax, %rcx
2536; SSE2-NEXT:    movq %rax, %rdx
2537; SSE2-NEXT:    movq %rax, %rsi
2538; SSE2-NEXT:    movq %rax, %rdi
2539; SSE2-NEXT:    movq %rax, %rbp
2540; SSE2-NEXT:    shrq $15, %rbp
2541; SSE2-NEXT:    movd %ebp, %xmm0
2542; SSE2-NEXT:    movq %rax, %rbp
2543; SSE2-NEXT:    movsbq %al, %rax
2544; SSE2-NEXT:    shlq $49, %r8
2545; SSE2-NEXT:    sarq $63, %r8
2546; SSE2-NEXT:    movd %r8d, %xmm1
2547; SSE2-NEXT:    shlq $50, %r9
2548; SSE2-NEXT:    sarq $63, %r9
2549; SSE2-NEXT:    movd %r9d, %xmm2
2550; SSE2-NEXT:    shlq $51, %r10
2551; SSE2-NEXT:    sarq $63, %r10
2552; SSE2-NEXT:    movd %r10d, %xmm3
2553; SSE2-NEXT:    shlq $52, %r11
2554; SSE2-NEXT:    sarq $63, %r11
2555; SSE2-NEXT:    movd %r11d, %xmm4
2556; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2557; SSE2-NEXT:    shlq $53, %r14
2558; SSE2-NEXT:    sarq $63, %r14
2559; SSE2-NEXT:    movd %r14d, %xmm0
2560; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2561; SSE2-NEXT:    shlq $54, %r15
2562; SSE2-NEXT:    sarq $63, %r15
2563; SSE2-NEXT:    movd %r15d, %xmm2
2564; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2565; SSE2-NEXT:    shlq $55, %r12
2566; SSE2-NEXT:    sarq $63, %r12
2567; SSE2-NEXT:    movd %r12d, %xmm1
2568; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2569; SSE2-NEXT:    shlq $60, %r13
2570; SSE2-NEXT:    sarq $63, %r13
2571; SSE2-NEXT:    movd %r13d, %xmm4
2572; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2573; SSE2-NEXT:    shlq $61, %rbx
2574; SSE2-NEXT:    sarq $63, %rbx
2575; SSE2-NEXT:    movd %ebx, %xmm2
2576; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2577; SSE2-NEXT:    shlq $62, %rcx
2578; SSE2-NEXT:    sarq $63, %rcx
2579; SSE2-NEXT:    movd %ecx, %xmm5
2580; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2581; SSE2-NEXT:    shlq $63, %rdx
2582; SSE2-NEXT:    sarq $63, %rdx
2583; SSE2-NEXT:    movd %edx, %xmm0
2584; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2585; SSE2-NEXT:    shlq $58, %rsi
2586; SSE2-NEXT:    sarq $63, %rsi
2587; SSE2-NEXT:    movd %esi, %xmm3
2588; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2589; SSE2-NEXT:    shlq $59, %rdi
2590; SSE2-NEXT:    sarq $63, %rdi
2591; SSE2-NEXT:    movd %edi, %xmm4
2592; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2593; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2594; SSE2-NEXT:    shlq $57, %rbp
2595; SSE2-NEXT:    sarq $63, %rbp
2596; SSE2-NEXT:    movd %ebp, %xmm2
2597; SSE2-NEXT:    shrq $7, %rax
2598; SSE2-NEXT:    movd %eax, %xmm3
2599; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
2600; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2601; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2602; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2603; SSE2-NEXT:    popq %rbx
2604; SSE2-NEXT:    popq %r12
2605; SSE2-NEXT:    popq %r13
2606; SSE2-NEXT:    popq %r14
2607; SSE2-NEXT:    popq %r15
2608; SSE2-NEXT:    popq %rbp
2609; SSE2-NEXT:    retq
2610;
2611; SSSE3-LABEL: load_sext_16i1_to_16i8:
2612; SSSE3:       # %bb.0: # %entry
2613; SSSE3-NEXT:    pushq %rbp
2614; SSSE3-NEXT:    pushq %r15
2615; SSSE3-NEXT:    pushq %r14
2616; SSSE3-NEXT:    pushq %r13
2617; SSSE3-NEXT:    pushq %r12
2618; SSSE3-NEXT:    pushq %rbx
2619; SSSE3-NEXT:    movswq (%rdi), %rax
2620; SSSE3-NEXT:    movq %rax, %r8
2621; SSSE3-NEXT:    movq %rax, %r9
2622; SSSE3-NEXT:    movq %rax, %r10
2623; SSSE3-NEXT:    movq %rax, %r11
2624; SSSE3-NEXT:    movq %rax, %r14
2625; SSSE3-NEXT:    movq %rax, %r15
2626; SSSE3-NEXT:    movq %rax, %r12
2627; SSSE3-NEXT:    movq %rax, %r13
2628; SSSE3-NEXT:    movq %rax, %rbx
2629; SSSE3-NEXT:    movq %rax, %rcx
2630; SSSE3-NEXT:    movq %rax, %rdx
2631; SSSE3-NEXT:    movq %rax, %rsi
2632; SSSE3-NEXT:    movq %rax, %rdi
2633; SSSE3-NEXT:    movq %rax, %rbp
2634; SSSE3-NEXT:    shrq $15, %rbp
2635; SSSE3-NEXT:    movd %ebp, %xmm0
2636; SSSE3-NEXT:    movq %rax, %rbp
2637; SSSE3-NEXT:    movsbq %al, %rax
2638; SSSE3-NEXT:    shlq $49, %r8
2639; SSSE3-NEXT:    sarq $63, %r8
2640; SSSE3-NEXT:    movd %r8d, %xmm1
2641; SSSE3-NEXT:    shlq $50, %r9
2642; SSSE3-NEXT:    sarq $63, %r9
2643; SSSE3-NEXT:    movd %r9d, %xmm2
2644; SSSE3-NEXT:    shlq $51, %r10
2645; SSSE3-NEXT:    sarq $63, %r10
2646; SSSE3-NEXT:    movd %r10d, %xmm3
2647; SSSE3-NEXT:    shlq $52, %r11
2648; SSSE3-NEXT:    sarq $63, %r11
2649; SSSE3-NEXT:    movd %r11d, %xmm4
2650; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2651; SSSE3-NEXT:    shlq $53, %r14
2652; SSSE3-NEXT:    sarq $63, %r14
2653; SSSE3-NEXT:    movd %r14d, %xmm0
2654; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2655; SSSE3-NEXT:    shlq $54, %r15
2656; SSSE3-NEXT:    sarq $63, %r15
2657; SSSE3-NEXT:    movd %r15d, %xmm2
2658; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2659; SSSE3-NEXT:    shlq $55, %r12
2660; SSSE3-NEXT:    sarq $63, %r12
2661; SSSE3-NEXT:    movd %r12d, %xmm1
2662; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2663; SSSE3-NEXT:    shlq $60, %r13
2664; SSSE3-NEXT:    sarq $63, %r13
2665; SSSE3-NEXT:    movd %r13d, %xmm4
2666; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2667; SSSE3-NEXT:    shlq $61, %rbx
2668; SSSE3-NEXT:    sarq $63, %rbx
2669; SSSE3-NEXT:    movd %ebx, %xmm2
2670; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2671; SSSE3-NEXT:    shlq $62, %rcx
2672; SSSE3-NEXT:    sarq $63, %rcx
2673; SSSE3-NEXT:    movd %ecx, %xmm5
2674; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2675; SSSE3-NEXT:    shlq $63, %rdx
2676; SSSE3-NEXT:    sarq $63, %rdx
2677; SSSE3-NEXT:    movd %edx, %xmm0
2678; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2679; SSSE3-NEXT:    shlq $58, %rsi
2680; SSSE3-NEXT:    sarq $63, %rsi
2681; SSSE3-NEXT:    movd %esi, %xmm3
2682; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2683; SSSE3-NEXT:    shlq $59, %rdi
2684; SSSE3-NEXT:    sarq $63, %rdi
2685; SSSE3-NEXT:    movd %edi, %xmm4
2686; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2687; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2688; SSSE3-NEXT:    shlq $57, %rbp
2689; SSSE3-NEXT:    sarq $63, %rbp
2690; SSSE3-NEXT:    movd %ebp, %xmm2
2691; SSSE3-NEXT:    shrq $7, %rax
2692; SSSE3-NEXT:    movd %eax, %xmm3
2693; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
2694; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2695; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2696; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2697; SSSE3-NEXT:    popq %rbx
2698; SSSE3-NEXT:    popq %r12
2699; SSSE3-NEXT:    popq %r13
2700; SSSE3-NEXT:    popq %r14
2701; SSSE3-NEXT:    popq %r15
2702; SSSE3-NEXT:    popq %rbp
2703; SSSE3-NEXT:    retq
2704;
2705; SSE41-LABEL: load_sext_16i1_to_16i8:
2706; SSE41:       # %bb.0: # %entry
2707; SSE41-NEXT:    movswq (%rdi), %rax
2708; SSE41-NEXT:    movq %rax, %rcx
2709; SSE41-NEXT:    shlq $62, %rcx
2710; SSE41-NEXT:    sarq $63, %rcx
2711; SSE41-NEXT:    movq %rax, %rdx
2712; SSE41-NEXT:    shlq $63, %rdx
2713; SSE41-NEXT:    sarq $63, %rdx
2714; SSE41-NEXT:    movd %edx, %xmm0
2715; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
2716; SSE41-NEXT:    movq %rax, %rcx
2717; SSE41-NEXT:    shlq $61, %rcx
2718; SSE41-NEXT:    sarq $63, %rcx
2719; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
2720; SSE41-NEXT:    movq %rax, %rcx
2721; SSE41-NEXT:    shlq $60, %rcx
2722; SSE41-NEXT:    sarq $63, %rcx
2723; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
2724; SSE41-NEXT:    movq %rax, %rcx
2725; SSE41-NEXT:    shlq $59, %rcx
2726; SSE41-NEXT:    sarq $63, %rcx
2727; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
2728; SSE41-NEXT:    movq %rax, %rcx
2729; SSE41-NEXT:    shlq $58, %rcx
2730; SSE41-NEXT:    sarq $63, %rcx
2731; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
2732; SSE41-NEXT:    movq %rax, %rcx
2733; SSE41-NEXT:    shlq $57, %rcx
2734; SSE41-NEXT:    sarq $63, %rcx
2735; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
2736; SSE41-NEXT:    movsbq %al, %rcx
2737; SSE41-NEXT:    shrq $7, %rcx
2738; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
2739; SSE41-NEXT:    movq %rax, %rcx
2740; SSE41-NEXT:    shlq $55, %rcx
2741; SSE41-NEXT:    sarq $63, %rcx
2742; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
2743; SSE41-NEXT:    movq %rax, %rcx
2744; SSE41-NEXT:    shlq $54, %rcx
2745; SSE41-NEXT:    sarq $63, %rcx
2746; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
2747; SSE41-NEXT:    movq %rax, %rcx
2748; SSE41-NEXT:    shlq $53, %rcx
2749; SSE41-NEXT:    sarq $63, %rcx
2750; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
2751; SSE41-NEXT:    movq %rax, %rcx
2752; SSE41-NEXT:    shlq $52, %rcx
2753; SSE41-NEXT:    sarq $63, %rcx
2754; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
2755; SSE41-NEXT:    movq %rax, %rcx
2756; SSE41-NEXT:    shlq $51, %rcx
2757; SSE41-NEXT:    sarq $63, %rcx
2758; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
2759; SSE41-NEXT:    movq %rax, %rcx
2760; SSE41-NEXT:    shlq $50, %rcx
2761; SSE41-NEXT:    sarq $63, %rcx
2762; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
2763; SSE41-NEXT:    movq %rax, %rcx
2764; SSE41-NEXT:    shlq $49, %rcx
2765; SSE41-NEXT:    sarq $63, %rcx
2766; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
2767; SSE41-NEXT:    shrq $15, %rax
2768; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
2769; SSE41-NEXT:    retq
2770;
2771; AVX1-LABEL: load_sext_16i1_to_16i8:
2772; AVX1:       # %bb.0: # %entry
2773; AVX1-NEXT:    movswq (%rdi), %rax
2774; AVX1-NEXT:    movq %rax, %rcx
2775; AVX1-NEXT:    shlq $62, %rcx
2776; AVX1-NEXT:    sarq $63, %rcx
2777; AVX1-NEXT:    movq %rax, %rdx
2778; AVX1-NEXT:    shlq $63, %rdx
2779; AVX1-NEXT:    sarq $63, %rdx
2780; AVX1-NEXT:    vmovd %edx, %xmm0
2781; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
2782; AVX1-NEXT:    movq %rax, %rcx
2783; AVX1-NEXT:    shlq $61, %rcx
2784; AVX1-NEXT:    sarq $63, %rcx
2785; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
2786; AVX1-NEXT:    movq %rax, %rcx
2787; AVX1-NEXT:    shlq $60, %rcx
2788; AVX1-NEXT:    sarq $63, %rcx
2789; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
2790; AVX1-NEXT:    movq %rax, %rcx
2791; AVX1-NEXT:    shlq $59, %rcx
2792; AVX1-NEXT:    sarq $63, %rcx
2793; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
2794; AVX1-NEXT:    movq %rax, %rcx
2795; AVX1-NEXT:    shlq $58, %rcx
2796; AVX1-NEXT:    sarq $63, %rcx
2797; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
2798; AVX1-NEXT:    movq %rax, %rcx
2799; AVX1-NEXT:    shlq $57, %rcx
2800; AVX1-NEXT:    sarq $63, %rcx
2801; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
2802; AVX1-NEXT:    movsbq %al, %rcx
2803; AVX1-NEXT:    shrq $7, %rcx
2804; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
2805; AVX1-NEXT:    movq %rax, %rcx
2806; AVX1-NEXT:    shlq $55, %rcx
2807; AVX1-NEXT:    sarq $63, %rcx
2808; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
2809; AVX1-NEXT:    movq %rax, %rcx
2810; AVX1-NEXT:    shlq $54, %rcx
2811; AVX1-NEXT:    sarq $63, %rcx
2812; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
2813; AVX1-NEXT:    movq %rax, %rcx
2814; AVX1-NEXT:    shlq $53, %rcx
2815; AVX1-NEXT:    sarq $63, %rcx
2816; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
2817; AVX1-NEXT:    movq %rax, %rcx
2818; AVX1-NEXT:    shlq $52, %rcx
2819; AVX1-NEXT:    sarq $63, %rcx
2820; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
2821; AVX1-NEXT:    movq %rax, %rcx
2822; AVX1-NEXT:    shlq $51, %rcx
2823; AVX1-NEXT:    sarq $63, %rcx
2824; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
2825; AVX1-NEXT:    movq %rax, %rcx
2826; AVX1-NEXT:    shlq $50, %rcx
2827; AVX1-NEXT:    sarq $63, %rcx
2828; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
2829; AVX1-NEXT:    movq %rax, %rcx
2830; AVX1-NEXT:    shlq $49, %rcx
2831; AVX1-NEXT:    sarq $63, %rcx
2832; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
2833; AVX1-NEXT:    shrq $15, %rax
2834; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2835; AVX1-NEXT:    retq
2836;
2837; AVX2-LABEL: load_sext_16i1_to_16i8:
2838; AVX2:       # %bb.0: # %entry
2839; AVX2-NEXT:    movswq (%rdi), %rax
2840; AVX2-NEXT:    movq %rax, %rcx
2841; AVX2-NEXT:    shlq $62, %rcx
2842; AVX2-NEXT:    sarq $63, %rcx
2843; AVX2-NEXT:    movq %rax, %rdx
2844; AVX2-NEXT:    shlq $63, %rdx
2845; AVX2-NEXT:    sarq $63, %rdx
2846; AVX2-NEXT:    vmovd %edx, %xmm0
2847; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
2848; AVX2-NEXT:    movq %rax, %rcx
2849; AVX2-NEXT:    shlq $61, %rcx
2850; AVX2-NEXT:    sarq $63, %rcx
2851; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
2852; AVX2-NEXT:    movq %rax, %rcx
2853; AVX2-NEXT:    shlq $60, %rcx
2854; AVX2-NEXT:    sarq $63, %rcx
2855; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
2856; AVX2-NEXT:    movq %rax, %rcx
2857; AVX2-NEXT:    shlq $59, %rcx
2858; AVX2-NEXT:    sarq $63, %rcx
2859; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
2860; AVX2-NEXT:    movq %rax, %rcx
2861; AVX2-NEXT:    shlq $58, %rcx
2862; AVX2-NEXT:    sarq $63, %rcx
2863; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
2864; AVX2-NEXT:    movq %rax, %rcx
2865; AVX2-NEXT:    shlq $57, %rcx
2866; AVX2-NEXT:    sarq $63, %rcx
2867; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
2868; AVX2-NEXT:    movsbq %al, %rcx
2869; AVX2-NEXT:    shrq $7, %rcx
2870; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
2871; AVX2-NEXT:    movq %rax, %rcx
2872; AVX2-NEXT:    shlq $55, %rcx
2873; AVX2-NEXT:    sarq $63, %rcx
2874; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
2875; AVX2-NEXT:    movq %rax, %rcx
2876; AVX2-NEXT:    shlq $54, %rcx
2877; AVX2-NEXT:    sarq $63, %rcx
2878; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
2879; AVX2-NEXT:    movq %rax, %rcx
2880; AVX2-NEXT:    shlq $53, %rcx
2881; AVX2-NEXT:    sarq $63, %rcx
2882; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
2883; AVX2-NEXT:    movq %rax, %rcx
2884; AVX2-NEXT:    shlq $52, %rcx
2885; AVX2-NEXT:    sarq $63, %rcx
2886; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
2887; AVX2-NEXT:    movq %rax, %rcx
2888; AVX2-NEXT:    shlq $51, %rcx
2889; AVX2-NEXT:    sarq $63, %rcx
2890; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
2891; AVX2-NEXT:    movq %rax, %rcx
2892; AVX2-NEXT:    shlq $50, %rcx
2893; AVX2-NEXT:    sarq $63, %rcx
2894; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
2895; AVX2-NEXT:    movq %rax, %rcx
2896; AVX2-NEXT:    shlq $49, %rcx
2897; AVX2-NEXT:    sarq $63, %rcx
2898; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
2899; AVX2-NEXT:    shrq $15, %rax
2900; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2901; AVX2-NEXT:    retq
2902;
2903; AVX512F-LABEL: load_sext_16i1_to_16i8:
2904; AVX512F:       # %bb.0: # %entry
2905; AVX512F-NEXT:    kmovw (%rdi), %k1
2906; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2907; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2908; AVX512F-NEXT:    vzeroupper
2909; AVX512F-NEXT:    retq
2910;
2911; AVX512BW-LABEL: load_sext_16i1_to_16i8:
2912; AVX512BW:       # %bb.0: # %entry
2913; AVX512BW-NEXT:    kmovw (%rdi), %k0
2914; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
2915; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2916; AVX512BW-NEXT:    vzeroupper
2917; AVX512BW-NEXT:    retq
2918;
2919; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
2920; X32-SSE41:       # %bb.0: # %entry
2921; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2922; X32-SSE41-NEXT:    movswl (%eax), %eax
2923; X32-SSE41-NEXT:    movl %eax, %ecx
2924; X32-SSE41-NEXT:    shll $30, %ecx
2925; X32-SSE41-NEXT:    sarl $31, %ecx
2926; X32-SSE41-NEXT:    movl %eax, %edx
2927; X32-SSE41-NEXT:    shll $31, %edx
2928; X32-SSE41-NEXT:    sarl $31, %edx
2929; X32-SSE41-NEXT:    movd %edx, %xmm0
2930; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
2931; X32-SSE41-NEXT:    movl %eax, %ecx
2932; X32-SSE41-NEXT:    shll $29, %ecx
2933; X32-SSE41-NEXT:    sarl $31, %ecx
2934; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
2935; X32-SSE41-NEXT:    movl %eax, %ecx
2936; X32-SSE41-NEXT:    shll $28, %ecx
2937; X32-SSE41-NEXT:    sarl $31, %ecx
2938; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
2939; X32-SSE41-NEXT:    movl %eax, %ecx
2940; X32-SSE41-NEXT:    shll $27, %ecx
2941; X32-SSE41-NEXT:    sarl $31, %ecx
2942; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
2943; X32-SSE41-NEXT:    movl %eax, %ecx
2944; X32-SSE41-NEXT:    shll $26, %ecx
2945; X32-SSE41-NEXT:    sarl $31, %ecx
2946; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
2947; X32-SSE41-NEXT:    movl %eax, %ecx
2948; X32-SSE41-NEXT:    shll $25, %ecx
2949; X32-SSE41-NEXT:    sarl $31, %ecx
2950; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
2951; X32-SSE41-NEXT:    movsbl %al, %ecx
2952; X32-SSE41-NEXT:    shrl $7, %ecx
2953; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
2954; X32-SSE41-NEXT:    movl %eax, %ecx
2955; X32-SSE41-NEXT:    shll $23, %ecx
2956; X32-SSE41-NEXT:    sarl $31, %ecx
2957; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
2958; X32-SSE41-NEXT:    movl %eax, %ecx
2959; X32-SSE41-NEXT:    shll $22, %ecx
2960; X32-SSE41-NEXT:    sarl $31, %ecx
2961; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
2962; X32-SSE41-NEXT:    movl %eax, %ecx
2963; X32-SSE41-NEXT:    shll $21, %ecx
2964; X32-SSE41-NEXT:    sarl $31, %ecx
2965; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
2966; X32-SSE41-NEXT:    movl %eax, %ecx
2967; X32-SSE41-NEXT:    shll $20, %ecx
2968; X32-SSE41-NEXT:    sarl $31, %ecx
2969; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
2970; X32-SSE41-NEXT:    movl %eax, %ecx
2971; X32-SSE41-NEXT:    shll $19, %ecx
2972; X32-SSE41-NEXT:    sarl $31, %ecx
2973; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
2974; X32-SSE41-NEXT:    movl %eax, %ecx
2975; X32-SSE41-NEXT:    shll $18, %ecx
2976; X32-SSE41-NEXT:    sarl $31, %ecx
2977; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
2978; X32-SSE41-NEXT:    movl %eax, %ecx
2979; X32-SSE41-NEXT:    shll $17, %ecx
2980; X32-SSE41-NEXT:    sarl $31, %ecx
2981; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
2982; X32-SSE41-NEXT:    shrl $15, %eax
2983; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm0
2984; X32-SSE41-NEXT:    retl
; IR under test: one <16 x i1> load followed by one vector sext to i8 lanes.
2985entry:
2986 %X = load <16 x i1>, <16 x i1>* %ptr
2987 %Y = sext <16 x i1> %X to <16 x i8>
2988 ret <16 x i8> %Y
2989}
2990
2991define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
2992; SSE2-LABEL: load_sext_16i1_to_16i16:
2993; SSE2:       # %bb.0: # %entry
2994; SSE2-NEXT:    movzwl (%rdi), %eax
2995; SSE2-NEXT:    movl %eax, %ecx
2996; SSE2-NEXT:    shrl $15, %ecx
2997; SSE2-NEXT:    movd %ecx, %xmm0
2998; SSE2-NEXT:    movl %eax, %ecx
2999; SSE2-NEXT:    shrl $14, %ecx
3000; SSE2-NEXT:    andl $1, %ecx
3001; SSE2-NEXT:    movd %ecx, %xmm1
3002; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3003; SSE2-NEXT:    movl %eax, %ecx
3004; SSE2-NEXT:    shrl $13, %ecx
3005; SSE2-NEXT:    andl $1, %ecx
3006; SSE2-NEXT:    movd %ecx, %xmm0
3007; SSE2-NEXT:    movl %eax, %ecx
3008; SSE2-NEXT:    shrl $12, %ecx
3009; SSE2-NEXT:    andl $1, %ecx
3010; SSE2-NEXT:    movd %ecx, %xmm2
3011; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
3012; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3013; SSE2-NEXT:    movl %eax, %ecx
3014; SSE2-NEXT:    shrl $11, %ecx
3015; SSE2-NEXT:    andl $1, %ecx
3016; SSE2-NEXT:    movd %ecx, %xmm0
3017; SSE2-NEXT:    movl %eax, %ecx
3018; SSE2-NEXT:    shrl $10, %ecx
3019; SSE2-NEXT:    andl $1, %ecx
3020; SSE2-NEXT:    movd %ecx, %xmm1
3021; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3022; SSE2-NEXT:    movl %eax, %ecx
3023; SSE2-NEXT:    shrl $9, %ecx
3024; SSE2-NEXT:    andl $1, %ecx
3025; SSE2-NEXT:    movd %ecx, %xmm3
3026; SSE2-NEXT:    movl %eax, %ecx
3027; SSE2-NEXT:    shrl $8, %ecx
3028; SSE2-NEXT:    andl $1, %ecx
3029; SSE2-NEXT:    movd %ecx, %xmm0
3030; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3031; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3032; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3033; SSE2-NEXT:    movl %eax, %ecx
3034; SSE2-NEXT:    shrl $7, %ecx
3035; SSE2-NEXT:    andl $1, %ecx
3036; SSE2-NEXT:    movd %ecx, %xmm1
3037; SSE2-NEXT:    movl %eax, %ecx
3038; SSE2-NEXT:    shrl $6, %ecx
3039; SSE2-NEXT:    andl $1, %ecx
3040; SSE2-NEXT:    movd %ecx, %xmm2
3041; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3042; SSE2-NEXT:    movl %eax, %ecx
3043; SSE2-NEXT:    shrl $5, %ecx
3044; SSE2-NEXT:    andl $1, %ecx
3045; SSE2-NEXT:    movd %ecx, %xmm1
3046; SSE2-NEXT:    movl %eax, %ecx
3047; SSE2-NEXT:    shrl $4, %ecx
3048; SSE2-NEXT:    andl $1, %ecx
3049; SSE2-NEXT:    movd %ecx, %xmm3
3050; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
3051; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3052; SSE2-NEXT:    movl %eax, %ecx
3053; SSE2-NEXT:    shrl $3, %ecx
3054; SSE2-NEXT:    andl $1, %ecx
3055; SSE2-NEXT:    movd %ecx, %xmm1
3056; SSE2-NEXT:    movl %eax, %ecx
3057; SSE2-NEXT:    shrl $2, %ecx
3058; SSE2-NEXT:    andl $1, %ecx
3059; SSE2-NEXT:    movd %ecx, %xmm2
3060; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3061; SSE2-NEXT:    movl %eax, %ecx
3062; SSE2-NEXT:    andl $1, %ecx
3063; SSE2-NEXT:    movd %ecx, %xmm1
3064; SSE2-NEXT:    shrl %eax
3065; SSE2-NEXT:    andl $1, %eax
3066; SSE2-NEXT:    movd %eax, %xmm4
3067; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3068; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3069; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
3070; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3071; SSE2-NEXT:    movdqa %xmm1, %xmm0
3072; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3073; SSE2-NEXT:    psllw $15, %xmm0
3074; SSE2-NEXT:    psraw $15, %xmm0
3075; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
3076; SSE2-NEXT:    psllw $15, %xmm1
3077; SSE2-NEXT:    psraw $15, %xmm1
3078; SSE2-NEXT:    retq
3079;
3080; SSSE3-LABEL: load_sext_16i1_to_16i16:
3081; SSSE3:       # %bb.0: # %entry
3082; SSSE3-NEXT:    movzwl (%rdi), %eax
3083; SSSE3-NEXT:    movl %eax, %ecx
3084; SSSE3-NEXT:    shrl $15, %ecx
3085; SSSE3-NEXT:    movd %ecx, %xmm0
3086; SSSE3-NEXT:    movl %eax, %ecx
3087; SSSE3-NEXT:    shrl $14, %ecx
3088; SSSE3-NEXT:    andl $1, %ecx
3089; SSSE3-NEXT:    movd %ecx, %xmm1
3090; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3091; SSSE3-NEXT:    movl %eax, %ecx
3092; SSSE3-NEXT:    shrl $13, %ecx
3093; SSSE3-NEXT:    andl $1, %ecx
3094; SSSE3-NEXT:    movd %ecx, %xmm0
3095; SSSE3-NEXT:    movl %eax, %ecx
3096; SSSE3-NEXT:    shrl $12, %ecx
3097; SSSE3-NEXT:    andl $1, %ecx
3098; SSSE3-NEXT:    movd %ecx, %xmm2
3099; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
3100; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3101; SSSE3-NEXT:    movl %eax, %ecx
3102; SSSE3-NEXT:    shrl $11, %ecx
3103; SSSE3-NEXT:    andl $1, %ecx
3104; SSSE3-NEXT:    movd %ecx, %xmm0
3105; SSSE3-NEXT:    movl %eax, %ecx
3106; SSSE3-NEXT:    shrl $10, %ecx
3107; SSSE3-NEXT:    andl $1, %ecx
3108; SSSE3-NEXT:    movd %ecx, %xmm1
3109; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3110; SSSE3-NEXT:    movl %eax, %ecx
3111; SSSE3-NEXT:    shrl $9, %ecx
3112; SSSE3-NEXT:    andl $1, %ecx
3113; SSSE3-NEXT:    movd %ecx, %xmm3
3114; SSSE3-NEXT:    movl %eax, %ecx
3115; SSSE3-NEXT:    shrl $8, %ecx
3116; SSSE3-NEXT:    andl $1, %ecx
3117; SSSE3-NEXT:    movd %ecx, %xmm0
3118; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3119; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3120; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3121; SSSE3-NEXT:    movl %eax, %ecx
3122; SSSE3-NEXT:    shrl $7, %ecx
3123; SSSE3-NEXT:    andl $1, %ecx
3124; SSSE3-NEXT:    movd %ecx, %xmm1
3125; SSSE3-NEXT:    movl %eax, %ecx
3126; SSSE3-NEXT:    shrl $6, %ecx
3127; SSSE3-NEXT:    andl $1, %ecx
3128; SSSE3-NEXT:    movd %ecx, %xmm2
3129; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3130; SSSE3-NEXT:    movl %eax, %ecx
3131; SSSE3-NEXT:    shrl $5, %ecx
3132; SSSE3-NEXT:    andl $1, %ecx
3133; SSSE3-NEXT:    movd %ecx, %xmm1
3134; SSSE3-NEXT:    movl %eax, %ecx
3135; SSSE3-NEXT:    shrl $4, %ecx
3136; SSSE3-NEXT:    andl $1, %ecx
3137; SSSE3-NEXT:    movd %ecx, %xmm3
3138; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
3139; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3140; SSSE3-NEXT:    movl %eax, %ecx
3141; SSSE3-NEXT:    shrl $3, %ecx
3142; SSSE3-NEXT:    andl $1, %ecx
3143; SSSE3-NEXT:    movd %ecx, %xmm1
3144; SSSE3-NEXT:    movl %eax, %ecx
3145; SSSE3-NEXT:    shrl $2, %ecx
3146; SSSE3-NEXT:    andl $1, %ecx
3147; SSSE3-NEXT:    movd %ecx, %xmm2
3148; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3149; SSSE3-NEXT:    movl %eax, %ecx
3150; SSSE3-NEXT:    andl $1, %ecx
3151; SSSE3-NEXT:    movd %ecx, %xmm1
3152; SSSE3-NEXT:    shrl %eax
3153; SSSE3-NEXT:    andl $1, %eax
3154; SSSE3-NEXT:    movd %eax, %xmm4
3155; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3156; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3157; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
3158; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3159; SSSE3-NEXT:    movdqa %xmm1, %xmm0
3160; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3161; SSSE3-NEXT:    psllw $15, %xmm0
3162; SSSE3-NEXT:    psraw $15, %xmm0
3163; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
3164; SSSE3-NEXT:    psllw $15, %xmm1
3165; SSSE3-NEXT:    psraw $15, %xmm1
3166; SSSE3-NEXT:    retq
3167;
3168; SSE41-LABEL: load_sext_16i1_to_16i16:
3169; SSE41:       # %bb.0: # %entry
3170; SSE41-NEXT:    movzwl (%rdi), %eax
3171; SSE41-NEXT:    movl %eax, %ecx
3172; SSE41-NEXT:    shrl %ecx
3173; SSE41-NEXT:    andl $1, %ecx
3174; SSE41-NEXT:    movl %eax, %edx
3175; SSE41-NEXT:    andl $1, %edx
3176; SSE41-NEXT:    movd %edx, %xmm1
3177; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3178; SSE41-NEXT:    movl %eax, %ecx
3179; SSE41-NEXT:    shrl $2, %ecx
3180; SSE41-NEXT:    andl $1, %ecx
3181; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3182; SSE41-NEXT:    movl %eax, %ecx
3183; SSE41-NEXT:    shrl $3, %ecx
3184; SSE41-NEXT:    andl $1, %ecx
3185; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3186; SSE41-NEXT:    movl %eax, %ecx
3187; SSE41-NEXT:    shrl $4, %ecx
3188; SSE41-NEXT:    andl $1, %ecx
3189; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3190; SSE41-NEXT:    movl %eax, %ecx
3191; SSE41-NEXT:    shrl $5, %ecx
3192; SSE41-NEXT:    andl $1, %ecx
3193; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3194; SSE41-NEXT:    movl %eax, %ecx
3195; SSE41-NEXT:    shrl $6, %ecx
3196; SSE41-NEXT:    andl $1, %ecx
3197; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3198; SSE41-NEXT:    movl %eax, %ecx
3199; SSE41-NEXT:    shrl $7, %ecx
3200; SSE41-NEXT:    andl $1, %ecx
3201; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3202; SSE41-NEXT:    movl %eax, %ecx
3203; SSE41-NEXT:    shrl $8, %ecx
3204; SSE41-NEXT:    andl $1, %ecx
3205; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3206; SSE41-NEXT:    movl %eax, %ecx
3207; SSE41-NEXT:    shrl $9, %ecx
3208; SSE41-NEXT:    andl $1, %ecx
3209; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3210; SSE41-NEXT:    movl %eax, %ecx
3211; SSE41-NEXT:    shrl $10, %ecx
3212; SSE41-NEXT:    andl $1, %ecx
3213; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3214; SSE41-NEXT:    movl %eax, %ecx
3215; SSE41-NEXT:    shrl $11, %ecx
3216; SSE41-NEXT:    andl $1, %ecx
3217; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3218; SSE41-NEXT:    movl %eax, %ecx
3219; SSE41-NEXT:    shrl $12, %ecx
3220; SSE41-NEXT:    andl $1, %ecx
3221; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3222; SSE41-NEXT:    movl %eax, %ecx
3223; SSE41-NEXT:    shrl $13, %ecx
3224; SSE41-NEXT:    andl $1, %ecx
3225; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3226; SSE41-NEXT:    movl %eax, %ecx
3227; SSE41-NEXT:    shrl $14, %ecx
3228; SSE41-NEXT:    andl $1, %ecx
3229; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3230; SSE41-NEXT:    shrl $15, %eax
3231; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3232; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
3233; SSE41-NEXT:    psllw $15, %xmm0
3234; SSE41-NEXT:    psraw $15, %xmm0
3235; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3236; SSE41-NEXT:    psllw $15, %xmm1
3237; SSE41-NEXT:    psraw $15, %xmm1
3238; SSE41-NEXT:    retq
3239;
3240; AVX1-LABEL: load_sext_16i1_to_16i16:
3241; AVX1:       # %bb.0: # %entry
3242; AVX1-NEXT:    pushq %rbp
3243; AVX1-NEXT:    .cfi_def_cfa_offset 16
3244; AVX1-NEXT:    pushq %r15
3245; AVX1-NEXT:    .cfi_def_cfa_offset 24
3246; AVX1-NEXT:    pushq %r14
3247; AVX1-NEXT:    .cfi_def_cfa_offset 32
3248; AVX1-NEXT:    pushq %r13
3249; AVX1-NEXT:    .cfi_def_cfa_offset 40
3250; AVX1-NEXT:    pushq %r12
3251; AVX1-NEXT:    .cfi_def_cfa_offset 48
3252; AVX1-NEXT:    pushq %rbx
3253; AVX1-NEXT:    .cfi_def_cfa_offset 56
3254; AVX1-NEXT:    .cfi_offset %rbx, -56
3255; AVX1-NEXT:    .cfi_offset %r12, -48
3256; AVX1-NEXT:    .cfi_offset %r13, -40
3257; AVX1-NEXT:    .cfi_offset %r14, -32
3258; AVX1-NEXT:    .cfi_offset %r15, -24
3259; AVX1-NEXT:    .cfi_offset %rbp, -16
3260; AVX1-NEXT:    movswq (%rdi), %rax
3261; AVX1-NEXT:    movq %rax, %rcx
3262; AVX1-NEXT:    shlq $55, %rcx
3263; AVX1-NEXT:    sarq $63, %rcx
3264; AVX1-NEXT:    vmovd %ecx, %xmm0
3265; AVX1-NEXT:    movq %rax, %r8
3266; AVX1-NEXT:    movq %rax, %r10
3267; AVX1-NEXT:    movq %rax, %r11
3268; AVX1-NEXT:    movq %rax, %r14
3269; AVX1-NEXT:    movq %rax, %r15
3270; AVX1-NEXT:    movq %rax, %r9
3271; AVX1-NEXT:    movq %rax, %r12
3272; AVX1-NEXT:    movq %rax, %r13
3273; AVX1-NEXT:    movq %rax, %rbx
3274; AVX1-NEXT:    movq %rax, %rdi
3275; AVX1-NEXT:    movq %rax, %rcx
3276; AVX1-NEXT:    movq %rax, %rdx
3277; AVX1-NEXT:    movq %rax, %rsi
3278; AVX1-NEXT:    movsbq %al, %rbp
3279; AVX1-NEXT:    shlq $54, %rax
3280; AVX1-NEXT:    sarq $63, %rax
3281; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
3282; AVX1-NEXT:    shlq $53, %r8
3283; AVX1-NEXT:    sarq $63, %r8
3284; AVX1-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
3285; AVX1-NEXT:    shlq $52, %r10
3286; AVX1-NEXT:    sarq $63, %r10
3287; AVX1-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
3288; AVX1-NEXT:    shlq $51, %r11
3289; AVX1-NEXT:    sarq $63, %r11
3290; AVX1-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
3291; AVX1-NEXT:    shlq $50, %r14
3292; AVX1-NEXT:    sarq $63, %r14
3293; AVX1-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
3294; AVX1-NEXT:    shlq $49, %r15
3295; AVX1-NEXT:    sarq $63, %r15
3296; AVX1-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
3297; AVX1-NEXT:    shrq $15, %r9
3298; AVX1-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
3299; AVX1-NEXT:    shlq $63, %r13
3300; AVX1-NEXT:    sarq $63, %r13
3301; AVX1-NEXT:    vmovd %r13d, %xmm1
3302; AVX1-NEXT:    shlq $62, %r12
3303; AVX1-NEXT:    sarq $63, %r12
3304; AVX1-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
3305; AVX1-NEXT:    shlq $61, %rbx
3306; AVX1-NEXT:    sarq $63, %rbx
3307; AVX1-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
3308; AVX1-NEXT:    shlq $60, %rdi
3309; AVX1-NEXT:    sarq $63, %rdi
3310; AVX1-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
3311; AVX1-NEXT:    shlq $59, %rcx
3312; AVX1-NEXT:    sarq $63, %rcx
3313; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
3314; AVX1-NEXT:    shlq $58, %rdx
3315; AVX1-NEXT:    sarq $63, %rdx
3316; AVX1-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
3317; AVX1-NEXT:    shlq $57, %rsi
3318; AVX1-NEXT:    sarq $63, %rsi
3319; AVX1-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
3320; AVX1-NEXT:    shrq $7, %rbp
3321; AVX1-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
3322; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3323; AVX1-NEXT:    popq %rbx
3324; AVX1-NEXT:    .cfi_def_cfa_offset 48
3325; AVX1-NEXT:    popq %r12
3326; AVX1-NEXT:    .cfi_def_cfa_offset 40
3327; AVX1-NEXT:    popq %r13
3328; AVX1-NEXT:    .cfi_def_cfa_offset 32
3329; AVX1-NEXT:    popq %r14
3330; AVX1-NEXT:    .cfi_def_cfa_offset 24
3331; AVX1-NEXT:    popq %r15
3332; AVX1-NEXT:    .cfi_def_cfa_offset 16
3333; AVX1-NEXT:    popq %rbp
3334; AVX1-NEXT:    .cfi_def_cfa_offset 8
3335; AVX1-NEXT:    retq
3336;
3337; AVX2-LABEL: load_sext_16i1_to_16i16:
3338; AVX2:       # %bb.0: # %entry
3339; AVX2-NEXT:    pushq %rbp
3340; AVX2-NEXT:    .cfi_def_cfa_offset 16
3341; AVX2-NEXT:    pushq %r15
3342; AVX2-NEXT:    .cfi_def_cfa_offset 24
3343; AVX2-NEXT:    pushq %r14
3344; AVX2-NEXT:    .cfi_def_cfa_offset 32
3345; AVX2-NEXT:    pushq %r13
3346; AVX2-NEXT:    .cfi_def_cfa_offset 40
3347; AVX2-NEXT:    pushq %r12
3348; AVX2-NEXT:    .cfi_def_cfa_offset 48
3349; AVX2-NEXT:    pushq %rbx
3350; AVX2-NEXT:    .cfi_def_cfa_offset 56
3351; AVX2-NEXT:    .cfi_offset %rbx, -56
3352; AVX2-NEXT:    .cfi_offset %r12, -48
3353; AVX2-NEXT:    .cfi_offset %r13, -40
3354; AVX2-NEXT:    .cfi_offset %r14, -32
3355; AVX2-NEXT:    .cfi_offset %r15, -24
3356; AVX2-NEXT:    .cfi_offset %rbp, -16
3357; AVX2-NEXT:    movswq (%rdi), %rax
3358; AVX2-NEXT:    movq %rax, %rcx
3359; AVX2-NEXT:    shlq $55, %rcx
3360; AVX2-NEXT:    sarq $63, %rcx
3361; AVX2-NEXT:    vmovd %ecx, %xmm0
3362; AVX2-NEXT:    movq %rax, %r8
3363; AVX2-NEXT:    movq %rax, %r10
3364; AVX2-NEXT:    movq %rax, %r11
3365; AVX2-NEXT:    movq %rax, %r14
3366; AVX2-NEXT:    movq %rax, %r15
3367; AVX2-NEXT:    movq %rax, %r9
3368; AVX2-NEXT:    movq %rax, %r12
3369; AVX2-NEXT:    movq %rax, %r13
3370; AVX2-NEXT:    movq %rax, %rbx
3371; AVX2-NEXT:    movq %rax, %rdi
3372; AVX2-NEXT:    movq %rax, %rcx
3373; AVX2-NEXT:    movq %rax, %rdx
3374; AVX2-NEXT:    movq %rax, %rsi
3375; AVX2-NEXT:    movsbq %al, %rbp
3376; AVX2-NEXT:    shlq $54, %rax
3377; AVX2-NEXT:    sarq $63, %rax
3378; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
3379; AVX2-NEXT:    shlq $53, %r8
3380; AVX2-NEXT:    sarq $63, %r8
3381; AVX2-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
3382; AVX2-NEXT:    shlq $52, %r10
3383; AVX2-NEXT:    sarq $63, %r10
3384; AVX2-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
3385; AVX2-NEXT:    shlq $51, %r11
3386; AVX2-NEXT:    sarq $63, %r11
3387; AVX2-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
3388; AVX2-NEXT:    shlq $50, %r14
3389; AVX2-NEXT:    sarq $63, %r14
3390; AVX2-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
3391; AVX2-NEXT:    shlq $49, %r15
3392; AVX2-NEXT:    sarq $63, %r15
3393; AVX2-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
3394; AVX2-NEXT:    shrq $15, %r9
3395; AVX2-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
3396; AVX2-NEXT:    shlq $63, %r13
3397; AVX2-NEXT:    sarq $63, %r13
3398; AVX2-NEXT:    vmovd %r13d, %xmm1
3399; AVX2-NEXT:    shlq $62, %r12
3400; AVX2-NEXT:    sarq $63, %r12
3401; AVX2-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
3402; AVX2-NEXT:    shlq $61, %rbx
3403; AVX2-NEXT:    sarq $63, %rbx
3404; AVX2-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
3405; AVX2-NEXT:    shlq $60, %rdi
3406; AVX2-NEXT:    sarq $63, %rdi
3407; AVX2-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
3408; AVX2-NEXT:    shlq $59, %rcx
3409; AVX2-NEXT:    sarq $63, %rcx
3410; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
3411; AVX2-NEXT:    shlq $58, %rdx
3412; AVX2-NEXT:    sarq $63, %rdx
3413; AVX2-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
3414; AVX2-NEXT:    shlq $57, %rsi
3415; AVX2-NEXT:    sarq $63, %rsi
3416; AVX2-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
3417; AVX2-NEXT:    shrq $7, %rbp
3418; AVX2-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
3419; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3420; AVX2-NEXT:    popq %rbx
3421; AVX2-NEXT:    .cfi_def_cfa_offset 48
3422; AVX2-NEXT:    popq %r12
3423; AVX2-NEXT:    .cfi_def_cfa_offset 40
3424; AVX2-NEXT:    popq %r13
3425; AVX2-NEXT:    .cfi_def_cfa_offset 32
3426; AVX2-NEXT:    popq %r14
3427; AVX2-NEXT:    .cfi_def_cfa_offset 24
3428; AVX2-NEXT:    popq %r15
3429; AVX2-NEXT:    .cfi_def_cfa_offset 16
3430; AVX2-NEXT:    popq %rbp
3431; AVX2-NEXT:    .cfi_def_cfa_offset 8
3432; AVX2-NEXT:    retq
3433;
3434; AVX512F-LABEL: load_sext_16i1_to_16i16:
3435; AVX512F:       # %bb.0: # %entry
3436; AVX512F-NEXT:    kmovw (%rdi), %k1
3437; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
3438; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
3439; AVX512F-NEXT:    retq
3440;
3441; AVX512BW-LABEL: load_sext_16i1_to_16i16:
3442; AVX512BW:       # %bb.0: # %entry
3443; AVX512BW-NEXT:    kmovw (%rdi), %k0
3444; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
3445; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3446; AVX512BW-NEXT:    retq
3447;
3448; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
3449; X32-SSE41:       # %bb.0: # %entry
3450; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3451; X32-SSE41-NEXT:    movzwl (%eax), %eax
3452; X32-SSE41-NEXT:    movl %eax, %ecx
3453; X32-SSE41-NEXT:    shrl %ecx
3454; X32-SSE41-NEXT:    andl $1, %ecx
3455; X32-SSE41-NEXT:    movl %eax, %edx
3456; X32-SSE41-NEXT:    andl $1, %edx
3457; X32-SSE41-NEXT:    movd %edx, %xmm1
3458; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3459; X32-SSE41-NEXT:    movl %eax, %ecx
3460; X32-SSE41-NEXT:    shrl $2, %ecx
3461; X32-SSE41-NEXT:    andl $1, %ecx
3462; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3463; X32-SSE41-NEXT:    movl %eax, %ecx
3464; X32-SSE41-NEXT:    shrl $3, %ecx
3465; X32-SSE41-NEXT:    andl $1, %ecx
3466; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3467; X32-SSE41-NEXT:    movl %eax, %ecx
3468; X32-SSE41-NEXT:    shrl $4, %ecx
3469; X32-SSE41-NEXT:    andl $1, %ecx
3470; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3471; X32-SSE41-NEXT:    movl %eax, %ecx
3472; X32-SSE41-NEXT:    shrl $5, %ecx
3473; X32-SSE41-NEXT:    andl $1, %ecx
3474; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3475; X32-SSE41-NEXT:    movl %eax, %ecx
3476; X32-SSE41-NEXT:    shrl $6, %ecx
3477; X32-SSE41-NEXT:    andl $1, %ecx
3478; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3479; X32-SSE41-NEXT:    movl %eax, %ecx
3480; X32-SSE41-NEXT:    shrl $7, %ecx
3481; X32-SSE41-NEXT:    andl $1, %ecx
3482; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3483; X32-SSE41-NEXT:    movl %eax, %ecx
3484; X32-SSE41-NEXT:    shrl $8, %ecx
3485; X32-SSE41-NEXT:    andl $1, %ecx
3486; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3487; X32-SSE41-NEXT:    movl %eax, %ecx
3488; X32-SSE41-NEXT:    shrl $9, %ecx
3489; X32-SSE41-NEXT:    andl $1, %ecx
3490; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3491; X32-SSE41-NEXT:    movl %eax, %ecx
3492; X32-SSE41-NEXT:    shrl $10, %ecx
3493; X32-SSE41-NEXT:    andl $1, %ecx
3494; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3495; X32-SSE41-NEXT:    movl %eax, %ecx
3496; X32-SSE41-NEXT:    shrl $11, %ecx
3497; X32-SSE41-NEXT:    andl $1, %ecx
3498; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3499; X32-SSE41-NEXT:    movl %eax, %ecx
3500; X32-SSE41-NEXT:    shrl $12, %ecx
3501; X32-SSE41-NEXT:    andl $1, %ecx
3502; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3503; X32-SSE41-NEXT:    movl %eax, %ecx
3504; X32-SSE41-NEXT:    shrl $13, %ecx
3505; X32-SSE41-NEXT:    andl $1, %ecx
3506; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3507; X32-SSE41-NEXT:    movl %eax, %ecx
3508; X32-SSE41-NEXT:    shrl $14, %ecx
3509; X32-SSE41-NEXT:    andl $1, %ecx
3510; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3511; X32-SSE41-NEXT:    shrl $15, %eax
3512; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3513; X32-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
3514; X32-SSE41-NEXT:    psllw $15, %xmm0
3515; X32-SSE41-NEXT:    psraw $15, %xmm0
3516; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3517; X32-SSE41-NEXT:    psllw $15, %xmm1
3518; X32-SSE41-NEXT:    psraw $15, %xmm1
3519; X32-SSE41-NEXT:    retl
3520entry:
3521 %X = load <16 x i1>, <16 x i1>* %ptr
3522 %Y = sext <16 x i1> %X to <16 x i16>
3523 ret <16 x i16> %Y
3524}
3525
3526define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
3527; SSE2-LABEL: load_sext_32i1_to_32i8:
3528; SSE2:       # %bb.0: # %entry
3529; SSE2-NEXT:    pushq %rbp
3530; SSE2-NEXT:    pushq %r15
3531; SSE2-NEXT:    pushq %r14
3532; SSE2-NEXT:    pushq %r13
3533; SSE2-NEXT:    pushq %r12
3534; SSE2-NEXT:    pushq %rbx
3535; SSE2-NEXT:    movswq (%rdi), %rax
3536; SSE2-NEXT:    movq %rax, %r10
3537; SSE2-NEXT:    movq %rax, %r8
3538; SSE2-NEXT:    movq %rax, %r9
3539; SSE2-NEXT:    movq %rax, %r11
3540; SSE2-NEXT:    movq %rax, %r14
3541; SSE2-NEXT:    movq %rax, %r15
3542; SSE2-NEXT:    movq %rax, %r12
3543; SSE2-NEXT:    movq %rax, %r13
3544; SSE2-NEXT:    movq %rax, %rdx
3545; SSE2-NEXT:    movq %rax, %rsi
3546; SSE2-NEXT:    movq %rax, %rcx
3547; SSE2-NEXT:    movq %rax, %rbp
3548; SSE2-NEXT:    movq %rax, %rbx
3549; SSE2-NEXT:    shrq $15, %rbx
3550; SSE2-NEXT:    movd %ebx, %xmm0
3551; SSE2-NEXT:    movq %rax, %rbx
3552; SSE2-NEXT:    shlq $49, %r10
3553; SSE2-NEXT:    sarq $63, %r10
3554; SSE2-NEXT:    movd %r10d, %xmm15
3555; SSE2-NEXT:    movq %rax, %r10
3556; SSE2-NEXT:    movsbq %al, %rax
3557; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
3558; SSE2-NEXT:    shlq $50, %r8
3559; SSE2-NEXT:    sarq $63, %r8
3560; SSE2-NEXT:    movd %r8d, %xmm8
3561; SSE2-NEXT:    shlq $51, %r9
3562; SSE2-NEXT:    sarq $63, %r9
3563; SSE2-NEXT:    movd %r9d, %xmm3
3564; SSE2-NEXT:    shlq $52, %r11
3565; SSE2-NEXT:    sarq $63, %r11
3566; SSE2-NEXT:    movd %r11d, %xmm9
3567; SSE2-NEXT:    shlq $53, %r14
3568; SSE2-NEXT:    sarq $63, %r14
3569; SSE2-NEXT:    movd %r14d, %xmm6
3570; SSE2-NEXT:    shlq $54, %r15
3571; SSE2-NEXT:    sarq $63, %r15
3572; SSE2-NEXT:    movd %r15d, %xmm10
3573; SSE2-NEXT:    shlq $55, %r12
3574; SSE2-NEXT:    sarq $63, %r12
3575; SSE2-NEXT:    movd %r12d, %xmm2
3576; SSE2-NEXT:    shlq $60, %r13
3577; SSE2-NEXT:    sarq $63, %r13
3578; SSE2-NEXT:    movd %r13d, %xmm11
3579; SSE2-NEXT:    shlq $61, %rdx
3580; SSE2-NEXT:    sarq $63, %rdx
3581; SSE2-NEXT:    movd %edx, %xmm5
3582; SSE2-NEXT:    shlq $62, %rsi
3583; SSE2-NEXT:    sarq $63, %rsi
3584; SSE2-NEXT:    movd %esi, %xmm12
3585; SSE2-NEXT:    shlq $63, %rcx
3586; SSE2-NEXT:    sarq $63, %rcx
3587; SSE2-NEXT:    movd %ecx, %xmm0
3588; SSE2-NEXT:    shlq $58, %rbp
3589; SSE2-NEXT:    sarq $63, %rbp
3590; SSE2-NEXT:    movd %ebp, %xmm13
3591; SSE2-NEXT:    shlq $59, %rbx
3592; SSE2-NEXT:    sarq $63, %rbx
3593; SSE2-NEXT:    movd %ebx, %xmm7
3594; SSE2-NEXT:    shlq $57, %r10
3595; SSE2-NEXT:    sarq $63, %r10
3596; SSE2-NEXT:    movd %r10d, %xmm4
3597; SSE2-NEXT:    shrq $7, %rax
3598; SSE2-NEXT:    movd %eax, %xmm14
3599; SSE2-NEXT:    movswq 2(%rdi), %rsi
3600; SSE2-NEXT:    movq %rsi, %r8
3601; SSE2-NEXT:    movq %rsi, %r9
3602; SSE2-NEXT:    movq %rsi, %r10
3603; SSE2-NEXT:    movq %rsi, %r11
3604; SSE2-NEXT:    movq %rsi, %r14
3605; SSE2-NEXT:    movq %rsi, %r15
3606; SSE2-NEXT:    movq %rsi, %r12
3607; SSE2-NEXT:    movq %rsi, %r13
3608; SSE2-NEXT:    movq %rsi, %rbx
3609; SSE2-NEXT:    movq %rsi, %rax
3610; SSE2-NEXT:    movq %rsi, %rcx
3611; SSE2-NEXT:    movq %rsi, %rdx
3612; SSE2-NEXT:    movq %rsi, %rdi
3613; SSE2-NEXT:    movq %rsi, %rbp
3614; SSE2-NEXT:    shrq $15, %rbp
3615; SSE2-NEXT:    movd %ebp, %xmm1
3616; SSE2-NEXT:    movq %rsi, %rbp
3617; SSE2-NEXT:    movsbq %sil, %rsi
3618; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
3619; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
3620; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
3621; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3622; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
3623; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3624; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
3625; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
3626; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
3627; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
3628; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
3629; SSE2-NEXT:    shlq $49, %r8
3630; SSE2-NEXT:    sarq $63, %r8
3631; SSE2-NEXT:    movd %r8d, %xmm3
3632; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
3633; SSE2-NEXT:    shlq $50, %r9
3634; SSE2-NEXT:    sarq $63, %r9
3635; SSE2-NEXT:    movd %r9d, %xmm4
3636; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
3637; SSE2-NEXT:    shlq $51, %r10
3638; SSE2-NEXT:    sarq $63, %r10
3639; SSE2-NEXT:    movd %r10d, %xmm5
3640; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3641; SSE2-NEXT:    shlq $52, %r11
3642; SSE2-NEXT:    sarq $63, %r11
3643; SSE2-NEXT:    movd %r11d, %xmm2
3644; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
3645; SSE2-NEXT:    shlq $53, %r14
3646; SSE2-NEXT:    sarq $63, %r14
3647; SSE2-NEXT:    movd %r14d, %xmm1
3648; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
3649; SSE2-NEXT:    shlq $54, %r15
3650; SSE2-NEXT:    sarq $63, %r15
3651; SSE2-NEXT:    movd %r15d, %xmm4
3652; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
3653; SSE2-NEXT:    shlq $55, %r12
3654; SSE2-NEXT:    sarq $63, %r12
3655; SSE2-NEXT:    movd %r12d, %xmm3
3656; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3657; SSE2-NEXT:    shlq $60, %r13
3658; SSE2-NEXT:    sarq $63, %r13
3659; SSE2-NEXT:    movd %r13d, %xmm2
3660; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
3661; SSE2-NEXT:    shlq $61, %rbx
3662; SSE2-NEXT:    sarq $63, %rbx
3663; SSE2-NEXT:    movd %ebx, %xmm4
3664; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3665; SSE2-NEXT:    shlq $62, %rax
3666; SSE2-NEXT:    sarq $63, %rax
3667; SSE2-NEXT:    movd %eax, %xmm6
3668; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
3669; SSE2-NEXT:    shlq $63, %rcx
3670; SSE2-NEXT:    sarq $63, %rcx
3671; SSE2-NEXT:    movd %ecx, %xmm1
3672; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3673; SSE2-NEXT:    shlq $58, %rdx
3674; SSE2-NEXT:    sarq $63, %rdx
3675; SSE2-NEXT:    movd %edx, %xmm2
3676; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
3677; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
3678; SSE2-NEXT:    shlq $59, %rdi
3679; SSE2-NEXT:    sarq $63, %rdi
3680; SSE2-NEXT:    movd %edi, %xmm4
3681; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3682; SSE2-NEXT:    shlq $57, %rbp
3683; SSE2-NEXT:    sarq $63, %rbp
3684; SSE2-NEXT:    movd %ebp, %xmm2
3685; SSE2-NEXT:    shrq $7, %rsi
3686; SSE2-NEXT:    movd %esi, %xmm5
3687; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
3688; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
3689; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
3690; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
3691; SSE2-NEXT:    popq %rbx
3692; SSE2-NEXT:    popq %r12
3693; SSE2-NEXT:    popq %r13
3694; SSE2-NEXT:    popq %r14
3695; SSE2-NEXT:    popq %r15
3696; SSE2-NEXT:    popq %rbp
3697; SSE2-NEXT:    retq
3698;
3699; SSSE3-LABEL: load_sext_32i1_to_32i8:
3700; SSSE3:       # %bb.0: # %entry
3701; SSSE3-NEXT:    pushq %rbp
3702; SSSE3-NEXT:    pushq %r15
3703; SSSE3-NEXT:    pushq %r14
3704; SSSE3-NEXT:    pushq %r13
3705; SSSE3-NEXT:    pushq %r12
3706; SSSE3-NEXT:    pushq %rbx
3707; SSSE3-NEXT:    movswq (%rdi), %rax
3708; SSSE3-NEXT:    movq %rax, %r10
3709; SSSE3-NEXT:    movq %rax, %r8
3710; SSSE3-NEXT:    movq %rax, %r9
3711; SSSE3-NEXT:    movq %rax, %r11
3712; SSSE3-NEXT:    movq %rax, %r14
3713; SSSE3-NEXT:    movq %rax, %r15
3714; SSSE3-NEXT:    movq %rax, %r12
3715; SSSE3-NEXT:    movq %rax, %r13
3716; SSSE3-NEXT:    movq %rax, %rdx
3717; SSSE3-NEXT:    movq %rax, %rsi
3718; SSSE3-NEXT:    movq %rax, %rcx
3719; SSSE3-NEXT:    movq %rax, %rbp
3720; SSSE3-NEXT:    movq %rax, %rbx
3721; SSSE3-NEXT:    shrq $15, %rbx
3722; SSSE3-NEXT:    movd %ebx, %xmm0
3723; SSSE3-NEXT:    movq %rax, %rbx
3724; SSSE3-NEXT:    shlq $49, %r10
3725; SSSE3-NEXT:    sarq $63, %r10
3726; SSSE3-NEXT:    movd %r10d, %xmm15
3727; SSSE3-NEXT:    movq %rax, %r10
3728; SSSE3-NEXT:    movsbq %al, %rax
3729; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
3730; SSSE3-NEXT:    shlq $50, %r8
3731; SSSE3-NEXT:    sarq $63, %r8
3732; SSSE3-NEXT:    movd %r8d, %xmm8
3733; SSSE3-NEXT:    shlq $51, %r9
3734; SSSE3-NEXT:    sarq $63, %r9
3735; SSSE3-NEXT:    movd %r9d, %xmm3
3736; SSSE3-NEXT:    shlq $52, %r11
3737; SSSE3-NEXT:    sarq $63, %r11
3738; SSSE3-NEXT:    movd %r11d, %xmm9
3739; SSSE3-NEXT:    shlq $53, %r14
3740; SSSE3-NEXT:    sarq $63, %r14
3741; SSSE3-NEXT:    movd %r14d, %xmm6
3742; SSSE3-NEXT:    shlq $54, %r15
3743; SSSE3-NEXT:    sarq $63, %r15
3744; SSSE3-NEXT:    movd %r15d, %xmm10
3745; SSSE3-NEXT:    shlq $55, %r12
3746; SSSE3-NEXT:    sarq $63, %r12
3747; SSSE3-NEXT:    movd %r12d, %xmm2
3748; SSSE3-NEXT:    shlq $60, %r13
3749; SSSE3-NEXT:    sarq $63, %r13
3750; SSSE3-NEXT:    movd %r13d, %xmm11
3751; SSSE3-NEXT:    shlq $61, %rdx
3752; SSSE3-NEXT:    sarq $63, %rdx
3753; SSSE3-NEXT:    movd %edx, %xmm5
3754; SSSE3-NEXT:    shlq $62, %rsi
3755; SSSE3-NEXT:    sarq $63, %rsi
3756; SSSE3-NEXT:    movd %esi, %xmm12
3757; SSSE3-NEXT:    shlq $63, %rcx
3758; SSSE3-NEXT:    sarq $63, %rcx
3759; SSSE3-NEXT:    movd %ecx, %xmm0
3760; SSSE3-NEXT:    shlq $58, %rbp
3761; SSSE3-NEXT:    sarq $63, %rbp
3762; SSSE3-NEXT:    movd %ebp, %xmm13
3763; SSSE3-NEXT:    shlq $59, %rbx
3764; SSSE3-NEXT:    sarq $63, %rbx
3765; SSSE3-NEXT:    movd %ebx, %xmm7
3766; SSSE3-NEXT:    shlq $57, %r10
3767; SSSE3-NEXT:    sarq $63, %r10
3768; SSSE3-NEXT:    movd %r10d, %xmm4
3769; SSSE3-NEXT:    shrq $7, %rax
3770; SSSE3-NEXT:    movd %eax, %xmm14
3771; SSSE3-NEXT:    movswq 2(%rdi), %rsi
3772; SSSE3-NEXT:    movq %rsi, %r8
3773; SSSE3-NEXT:    movq %rsi, %r9
3774; SSSE3-NEXT:    movq %rsi, %r10
3775; SSSE3-NEXT:    movq %rsi, %r11
3776; SSSE3-NEXT:    movq %rsi, %r14
3777; SSSE3-NEXT:    movq %rsi, %r15
3778; SSSE3-NEXT:    movq %rsi, %r12
3779; SSSE3-NEXT:    movq %rsi, %r13
3780; SSSE3-NEXT:    movq %rsi, %rbx
3781; SSSE3-NEXT:    movq %rsi, %rax
3782; SSSE3-NEXT:    movq %rsi, %rcx
3783; SSSE3-NEXT:    movq %rsi, %rdx
3784; SSSE3-NEXT:    movq %rsi, %rdi
3785; SSSE3-NEXT:    movq %rsi, %rbp
3786; SSSE3-NEXT:    shrq $15, %rbp
3787; SSSE3-NEXT:    movd %ebp, %xmm1
3788; SSSE3-NEXT:    movq %rsi, %rbp
3789; SSSE3-NEXT:    movsbq %sil, %rsi
3790; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
3791; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
3792; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
3793; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3794; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
3795; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3796; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
3797; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
3798; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
3799; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
3800; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
3801; SSSE3-NEXT:    shlq $49, %r8
3802; SSSE3-NEXT:    sarq $63, %r8
3803; SSSE3-NEXT:    movd %r8d, %xmm3
3804; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
3805; SSSE3-NEXT:    shlq $50, %r9
3806; SSSE3-NEXT:    sarq $63, %r9
3807; SSSE3-NEXT:    movd %r9d, %xmm4
3808; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
3809; SSSE3-NEXT:    shlq $51, %r10
3810; SSSE3-NEXT:    sarq $63, %r10
3811; SSSE3-NEXT:    movd %r10d, %xmm5
3812; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3813; SSSE3-NEXT:    shlq $52, %r11
3814; SSSE3-NEXT:    sarq $63, %r11
3815; SSSE3-NEXT:    movd %r11d, %xmm2
3816; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
3817; SSSE3-NEXT:    shlq $53, %r14
3818; SSSE3-NEXT:    sarq $63, %r14
3819; SSSE3-NEXT:    movd %r14d, %xmm1
3820; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
3821; SSSE3-NEXT:    shlq $54, %r15
3822; SSSE3-NEXT:    sarq $63, %r15
3823; SSSE3-NEXT:    movd %r15d, %xmm4
3824; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
3825; SSSE3-NEXT:    shlq $55, %r12
3826; SSSE3-NEXT:    sarq $63, %r12
3827; SSSE3-NEXT:    movd %r12d, %xmm3
3828; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3829; SSSE3-NEXT:    shlq $60, %r13
3830; SSSE3-NEXT:    sarq $63, %r13
3831; SSSE3-NEXT:    movd %r13d, %xmm2
3832; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
3833; SSSE3-NEXT:    shlq $61, %rbx
3834; SSSE3-NEXT:    sarq $63, %rbx
3835; SSSE3-NEXT:    movd %ebx, %xmm4
3836; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3837; SSSE3-NEXT:    shlq $62, %rax
3838; SSSE3-NEXT:    sarq $63, %rax
3839; SSSE3-NEXT:    movd %eax, %xmm6
3840; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
3841; SSSE3-NEXT:    shlq $63, %rcx
3842; SSSE3-NEXT:    sarq $63, %rcx
3843; SSSE3-NEXT:    movd %ecx, %xmm1
3844; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3845; SSSE3-NEXT:    shlq $58, %rdx
3846; SSSE3-NEXT:    sarq $63, %rdx
3847; SSSE3-NEXT:    movd %edx, %xmm2
3848; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
3849; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
3850; SSSE3-NEXT:    shlq $59, %rdi
3851; SSSE3-NEXT:    sarq $63, %rdi
3852; SSSE3-NEXT:    movd %edi, %xmm4
3853; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3854; SSSE3-NEXT:    shlq $57, %rbp
3855; SSSE3-NEXT:    sarq $63, %rbp
3856; SSSE3-NEXT:    movd %ebp, %xmm2
3857; SSSE3-NEXT:    shrq $7, %rsi
3858; SSSE3-NEXT:    movd %esi, %xmm5
3859; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
3860; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
3861; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
3862; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
3863; SSSE3-NEXT:    popq %rbx
3864; SSSE3-NEXT:    popq %r12
3865; SSSE3-NEXT:    popq %r13
3866; SSSE3-NEXT:    popq %r14
3867; SSSE3-NEXT:    popq %r15
3868; SSSE3-NEXT:    popq %rbp
3869; SSSE3-NEXT:    retq
3870;
3871; SSE41-LABEL: load_sext_32i1_to_32i8:
3872; SSE41:       # %bb.0: # %entry
3873; SSE41-NEXT:    movswq (%rdi), %rax
3874; SSE41-NEXT:    movq %rax, %rcx
3875; SSE41-NEXT:    shlq $62, %rcx
3876; SSE41-NEXT:    sarq $63, %rcx
3877; SSE41-NEXT:    movq %rax, %rdx
3878; SSE41-NEXT:    shlq $63, %rdx
3879; SSE41-NEXT:    sarq $63, %rdx
3880; SSE41-NEXT:    movd %edx, %xmm0
3881; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
3882; SSE41-NEXT:    movq %rax, %rcx
3883; SSE41-NEXT:    shlq $61, %rcx
3884; SSE41-NEXT:    sarq $63, %rcx
3885; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
3886; SSE41-NEXT:    movq %rax, %rcx
3887; SSE41-NEXT:    shlq $60, %rcx
3888; SSE41-NEXT:    sarq $63, %rcx
3889; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
3890; SSE41-NEXT:    movq %rax, %rcx
3891; SSE41-NEXT:    shlq $59, %rcx
3892; SSE41-NEXT:    sarq $63, %rcx
3893; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
3894; SSE41-NEXT:    movq %rax, %rcx
3895; SSE41-NEXT:    shlq $58, %rcx
3896; SSE41-NEXT:    sarq $63, %rcx
3897; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
3898; SSE41-NEXT:    movq %rax, %rcx
3899; SSE41-NEXT:    shlq $57, %rcx
3900; SSE41-NEXT:    sarq $63, %rcx
3901; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
3902; SSE41-NEXT:    movsbq %al, %rcx
3903; SSE41-NEXT:    shrq $7, %rcx
3904; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
3905; SSE41-NEXT:    movq %rax, %rcx
3906; SSE41-NEXT:    shlq $55, %rcx
3907; SSE41-NEXT:    sarq $63, %rcx
3908; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
3909; SSE41-NEXT:    movq %rax, %rcx
3910; SSE41-NEXT:    shlq $54, %rcx
3911; SSE41-NEXT:    sarq $63, %rcx
3912; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
3913; SSE41-NEXT:    movq %rax, %rcx
3914; SSE41-NEXT:    shlq $53, %rcx
3915; SSE41-NEXT:    sarq $63, %rcx
3916; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
3917; SSE41-NEXT:    movq %rax, %rcx
3918; SSE41-NEXT:    shlq $52, %rcx
3919; SSE41-NEXT:    sarq $63, %rcx
3920; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
3921; SSE41-NEXT:    movq %rax, %rcx
3922; SSE41-NEXT:    shlq $51, %rcx
3923; SSE41-NEXT:    sarq $63, %rcx
3924; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
3925; SSE41-NEXT:    movq %rax, %rcx
3926; SSE41-NEXT:    shlq $50, %rcx
3927; SSE41-NEXT:    sarq $63, %rcx
3928; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
3929; SSE41-NEXT:    movq %rax, %rcx
3930; SSE41-NEXT:    shlq $49, %rcx
3931; SSE41-NEXT:    sarq $63, %rcx
3932; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
3933; SSE41-NEXT:    shrq $15, %rax
3934; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
3935; SSE41-NEXT:    movswq 2(%rdi), %rax
3936; SSE41-NEXT:    movq %rax, %rcx
3937; SSE41-NEXT:    shlq $62, %rcx
3938; SSE41-NEXT:    sarq $63, %rcx
3939; SSE41-NEXT:    movq %rax, %rdx
3940; SSE41-NEXT:    shlq $63, %rdx
3941; SSE41-NEXT:    sarq $63, %rdx
3942; SSE41-NEXT:    movd %edx, %xmm1
3943; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3944; SSE41-NEXT:    movq %rax, %rcx
3945; SSE41-NEXT:    shlq $61, %rcx
3946; SSE41-NEXT:    sarq $63, %rcx
3947; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3948; SSE41-NEXT:    movq %rax, %rcx
3949; SSE41-NEXT:    shlq $60, %rcx
3950; SSE41-NEXT:    sarq $63, %rcx
3951; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3952; SSE41-NEXT:    movq %rax, %rcx
3953; SSE41-NEXT:    shlq $59, %rcx
3954; SSE41-NEXT:    sarq $63, %rcx
3955; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3956; SSE41-NEXT:    movq %rax, %rcx
3957; SSE41-NEXT:    shlq $58, %rcx
3958; SSE41-NEXT:    sarq $63, %rcx
3959; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3960; SSE41-NEXT:    movq %rax, %rcx
3961; SSE41-NEXT:    shlq $57, %rcx
3962; SSE41-NEXT:    sarq $63, %rcx
3963; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3964; SSE41-NEXT:    movsbq %al, %rcx
3965; SSE41-NEXT:    shrq $7, %rcx
3966; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3967; SSE41-NEXT:    movq %rax, %rcx
3968; SSE41-NEXT:    shlq $55, %rcx
3969; SSE41-NEXT:    sarq $63, %rcx
3970; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3971; SSE41-NEXT:    movq %rax, %rcx
3972; SSE41-NEXT:    shlq $54, %rcx
3973; SSE41-NEXT:    sarq $63, %rcx
3974; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3975; SSE41-NEXT:    movq %rax, %rcx
3976; SSE41-NEXT:    shlq $53, %rcx
3977; SSE41-NEXT:    sarq $63, %rcx
3978; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3979; SSE41-NEXT:    movq %rax, %rcx
3980; SSE41-NEXT:    shlq $52, %rcx
3981; SSE41-NEXT:    sarq $63, %rcx
3982; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3983; SSE41-NEXT:    movq %rax, %rcx
3984; SSE41-NEXT:    shlq $51, %rcx
3985; SSE41-NEXT:    sarq $63, %rcx
3986; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3987; SSE41-NEXT:    movq %rax, %rcx
3988; SSE41-NEXT:    shlq $50, %rcx
3989; SSE41-NEXT:    sarq $63, %rcx
3990; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3991; SSE41-NEXT:    movq %rax, %rcx
3992; SSE41-NEXT:    shlq $49, %rcx
3993; SSE41-NEXT:    sarq $63, %rcx
3994; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3995; SSE41-NEXT:    shrq $15, %rax
3996; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3997; SSE41-NEXT:    retq
3998;
3999; AVX1-LABEL: load_sext_32i1_to_32i8:
4000; AVX1:       # %bb.0: # %entry
4001; AVX1-NEXT:    pushq %rbp
4002; AVX1-NEXT:    pushq %r15
4003; AVX1-NEXT:    pushq %r14
4004; AVX1-NEXT:    pushq %r13
4005; AVX1-NEXT:    pushq %r12
4006; AVX1-NEXT:    pushq %rbx
4007; AVX1-NEXT:    movslq (%rdi), %rax
4008; AVX1-NEXT:    movq %rax, %rcx
4009; AVX1-NEXT:    shlq $47, %rcx
4010; AVX1-NEXT:    sarq $63, %rcx
4011; AVX1-NEXT:    vmovd %ecx, %xmm0
4012; AVX1-NEXT:    movq %rax, %r8
4013; AVX1-NEXT:    movq %rax, %rdx
4014; AVX1-NEXT:    movq %rax, %rcx
4015; AVX1-NEXT:    movq %rax, %rdi
4016; AVX1-NEXT:    movq %rax, %r13
4017; AVX1-NEXT:    movq %rax, %rsi
4018; AVX1-NEXT:    movq %rax, %r10
4019; AVX1-NEXT:    movq %rax, %r11
4020; AVX1-NEXT:    movq %rax, %r9
4021; AVX1-NEXT:    movq %rax, %rbx
4022; AVX1-NEXT:    movq %rax, %r14
4023; AVX1-NEXT:    movq %rax, %r15
4024; AVX1-NEXT:    movq %rax, %r12
4025; AVX1-NEXT:    movq %rax, %rbp
4026; AVX1-NEXT:    shlq $46, %rbp
4027; AVX1-NEXT:    sarq $63, %rbp
4028; AVX1-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
4029; AVX1-NEXT:    movq %rax, %rbp
4030; AVX1-NEXT:    shlq $45, %r8
4031; AVX1-NEXT:    sarq $63, %r8
4032; AVX1-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
4033; AVX1-NEXT:    movq %rax, %r8
4034; AVX1-NEXT:    shlq $44, %rdx
4035; AVX1-NEXT:    sarq $63, %rdx
4036; AVX1-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
4037; AVX1-NEXT:    movq %rax, %rdx
4038; AVX1-NEXT:    shlq $43, %rcx
4039; AVX1-NEXT:    sarq $63, %rcx
4040; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
4041; AVX1-NEXT:    movq %rax, %rcx
4042; AVX1-NEXT:    shlq $42, %rdi
4043; AVX1-NEXT:    sarq $63, %rdi
4044; AVX1-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
4045; AVX1-NEXT:    movq %rax, %rdi
4046; AVX1-NEXT:    shlq $41, %r13
4047; AVX1-NEXT:    sarq $63, %r13
4048; AVX1-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
4049; AVX1-NEXT:    movq %rax, %r13
4050; AVX1-NEXT:    shlq $40, %rsi
4051; AVX1-NEXT:    sarq $63, %rsi
4052; AVX1-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
4053; AVX1-NEXT:    movq %rax, %rsi
4054; AVX1-NEXT:    shlq $39, %r10
4055; AVX1-NEXT:    sarq $63, %r10
4056; AVX1-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
4057; AVX1-NEXT:    movq %rax, %r10
4058; AVX1-NEXT:    shlq $38, %r11
4059; AVX1-NEXT:    sarq $63, %r11
4060; AVX1-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
4061; AVX1-NEXT:    movsbq %al, %r11
4062; AVX1-NEXT:    shlq $37, %r9
4063; AVX1-NEXT:    sarq $63, %r9
4064; AVX1-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
4065; AVX1-NEXT:    movq %rax, %r9
4066; AVX1-NEXT:    shlq $36, %rbx
4067; AVX1-NEXT:    sarq $63, %rbx
4068; AVX1-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
4069; AVX1-NEXT:    movq %rax, %rbx
4070; AVX1-NEXT:    shlq $35, %r14
4071; AVX1-NEXT:    sarq $63, %r14
4072; AVX1-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
4073; AVX1-NEXT:    movq %rax, %r14
4074; AVX1-NEXT:    shlq $34, %r15
4075; AVX1-NEXT:    sarq $63, %r15
4076; AVX1-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
4077; AVX1-NEXT:    movq %rax, %r15
4078; AVX1-NEXT:    shlq $33, %r12
4079; AVX1-NEXT:    sarq $63, %r12
4080; AVX1-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
4081; AVX1-NEXT:    movq %rax, %r12
4082; AVX1-NEXT:    shrq $31, %rbp
4083; AVX1-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
4084; AVX1-NEXT:    movq %rax, %rbp
4085; AVX1-NEXT:    shlq $63, %rdx
4086; AVX1-NEXT:    sarq $63, %rdx
4087; AVX1-NEXT:    vmovd %edx, %xmm1
4088; AVX1-NEXT:    movq %rax, %rdx
4089; AVX1-NEXT:    movswq %ax, %rax
4090; AVX1-NEXT:    shlq $62, %r8
4091; AVX1-NEXT:    sarq $63, %r8
4092; AVX1-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
4093; AVX1-NEXT:    shlq $61, %rcx
4094; AVX1-NEXT:    sarq $63, %rcx
4095; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
4096; AVX1-NEXT:    shlq $60, %rdi
4097; AVX1-NEXT:    sarq $63, %rdi
4098; AVX1-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
4099; AVX1-NEXT:    shlq $59, %r13
4100; AVX1-NEXT:    sarq $63, %r13
4101; AVX1-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
4102; AVX1-NEXT:    shlq $58, %rsi
4103; AVX1-NEXT:    sarq $63, %rsi
4104; AVX1-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
4105; AVX1-NEXT:    shlq $57, %r10
4106; AVX1-NEXT:    sarq $63, %r10
4107; AVX1-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
4108; AVX1-NEXT:    shrq $7, %r11
4109; AVX1-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
4110; AVX1-NEXT:    shlq $55, %r9
4111; AVX1-NEXT:    sarq $63, %r9
4112; AVX1-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
4113; AVX1-NEXT:    shlq $54, %rbx
4114; AVX1-NEXT:    sarq $63, %rbx
4115; AVX1-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
4116; AVX1-NEXT:    shlq $53, %r14
4117; AVX1-NEXT:    sarq $63, %r14
4118; AVX1-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
4119; AVX1-NEXT:    shlq $52, %r15
4120; AVX1-NEXT:    sarq $63, %r15
4121; AVX1-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
4122; AVX1-NEXT:    shlq $51, %r12
4123; AVX1-NEXT:    sarq $63, %r12
4124; AVX1-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
4125; AVX1-NEXT:    shlq $50, %rbp
4126; AVX1-NEXT:    sarq $63, %rbp
4127; AVX1-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
4128; AVX1-NEXT:    shlq $49, %rdx
4129; AVX1-NEXT:    sarq $63, %rdx
4130; AVX1-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
4131; AVX1-NEXT:    shrq $15, %rax
4132; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
4133; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4134; AVX1-NEXT:    popq %rbx
4135; AVX1-NEXT:    popq %r12
4136; AVX1-NEXT:    popq %r13
4137; AVX1-NEXT:    popq %r14
4138; AVX1-NEXT:    popq %r15
4139; AVX1-NEXT:    popq %rbp
4140; AVX1-NEXT:    retq
4141;
4142; AVX2-LABEL: load_sext_32i1_to_32i8:
4143; AVX2:       # %bb.0: # %entry
4144; AVX2-NEXT:    pushq %rbp
4145; AVX2-NEXT:    pushq %r15
4146; AVX2-NEXT:    pushq %r14
4147; AVX2-NEXT:    pushq %r13
4148; AVX2-NEXT:    pushq %r12
4149; AVX2-NEXT:    pushq %rbx
4150; AVX2-NEXT:    movslq (%rdi), %rax
4151; AVX2-NEXT:    movq %rax, %rcx
4152; AVX2-NEXT:    shlq $47, %rcx
4153; AVX2-NEXT:    sarq $63, %rcx
4154; AVX2-NEXT:    vmovd %ecx, %xmm0
4155; AVX2-NEXT:    movq %rax, %r8
4156; AVX2-NEXT:    movq %rax, %rdx
4157; AVX2-NEXT:    movq %rax, %rcx
4158; AVX2-NEXT:    movq %rax, %rdi
4159; AVX2-NEXT:    movq %rax, %r13
4160; AVX2-NEXT:    movq %rax, %rsi
4161; AVX2-NEXT:    movq %rax, %r10
4162; AVX2-NEXT:    movq %rax, %r11
4163; AVX2-NEXT:    movq %rax, %r9
4164; AVX2-NEXT:    movq %rax, %rbx
4165; AVX2-NEXT:    movq %rax, %r14
4166; AVX2-NEXT:    movq %rax, %r15
4167; AVX2-NEXT:    movq %rax, %r12
4168; AVX2-NEXT:    movq %rax, %rbp
4169; AVX2-NEXT:    shlq $46, %rbp
4170; AVX2-NEXT:    sarq $63, %rbp
4171; AVX2-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
4172; AVX2-NEXT:    movq %rax, %rbp
4173; AVX2-NEXT:    shlq $45, %r8
4174; AVX2-NEXT:    sarq $63, %r8
4175; AVX2-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
4176; AVX2-NEXT:    movq %rax, %r8
4177; AVX2-NEXT:    shlq $44, %rdx
4178; AVX2-NEXT:    sarq $63, %rdx
4179; AVX2-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
4180; AVX2-NEXT:    movq %rax, %rdx
4181; AVX2-NEXT:    shlq $43, %rcx
4182; AVX2-NEXT:    sarq $63, %rcx
4183; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
4184; AVX2-NEXT:    movq %rax, %rcx
4185; AVX2-NEXT:    shlq $42, %rdi
4186; AVX2-NEXT:    sarq $63, %rdi
4187; AVX2-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
4188; AVX2-NEXT:    movq %rax, %rdi
4189; AVX2-NEXT:    shlq $41, %r13
4190; AVX2-NEXT:    sarq $63, %r13
4191; AVX2-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
4192; AVX2-NEXT:    movq %rax, %r13
4193; AVX2-NEXT:    shlq $40, %rsi
4194; AVX2-NEXT:    sarq $63, %rsi
4195; AVX2-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
4196; AVX2-NEXT:    movq %rax, %rsi
4197; AVX2-NEXT:    shlq $39, %r10
4198; AVX2-NEXT:    sarq $63, %r10
4199; AVX2-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
4200; AVX2-NEXT:    movq %rax, %r10
4201; AVX2-NEXT:    shlq $38, %r11
4202; AVX2-NEXT:    sarq $63, %r11
4203; AVX2-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
4204; AVX2-NEXT:    movsbq %al, %r11
4205; AVX2-NEXT:    shlq $37, %r9
4206; AVX2-NEXT:    sarq $63, %r9
4207; AVX2-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
4208; AVX2-NEXT:    movq %rax, %r9
4209; AVX2-NEXT:    shlq $36, %rbx
4210; AVX2-NEXT:    sarq $63, %rbx
4211; AVX2-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
4212; AVX2-NEXT:    movq %rax, %rbx
4213; AVX2-NEXT:    shlq $35, %r14
4214; AVX2-NEXT:    sarq $63, %r14
4215; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
4216; AVX2-NEXT:    movq %rax, %r14
4217; AVX2-NEXT:    shlq $34, %r15
4218; AVX2-NEXT:    sarq $63, %r15
4219; AVX2-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
4220; AVX2-NEXT:    movq %rax, %r15
4221; AVX2-NEXT:    shlq $33, %r12
4222; AVX2-NEXT:    sarq $63, %r12
4223; AVX2-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
4224; AVX2-NEXT:    movq %rax, %r12
4225; AVX2-NEXT:    shrq $31, %rbp
4226; AVX2-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
4227; AVX2-NEXT:    movq %rax, %rbp
4228; AVX2-NEXT:    shlq $63, %rdx
4229; AVX2-NEXT:    sarq $63, %rdx
4230; AVX2-NEXT:    vmovd %edx, %xmm1
4231; AVX2-NEXT:    movq %rax, %rdx
4232; AVX2-NEXT:    movswq %ax, %rax
4233; AVX2-NEXT:    shlq $62, %r8
4234; AVX2-NEXT:    sarq $63, %r8
4235; AVX2-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
4236; AVX2-NEXT:    shlq $61, %rcx
4237; AVX2-NEXT:    sarq $63, %rcx
4238; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
4239; AVX2-NEXT:    shlq $60, %rdi
4240; AVX2-NEXT:    sarq $63, %rdi
4241; AVX2-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
4242; AVX2-NEXT:    shlq $59, %r13
4243; AVX2-NEXT:    sarq $63, %r13
4244; AVX2-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
4245; AVX2-NEXT:    shlq $58, %rsi
4246; AVX2-NEXT:    sarq $63, %rsi
4247; AVX2-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
4248; AVX2-NEXT:    shlq $57, %r10
4249; AVX2-NEXT:    sarq $63, %r10
4250; AVX2-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
4251; AVX2-NEXT:    shrq $7, %r11
4252; AVX2-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
4253; AVX2-NEXT:    shlq $55, %r9
4254; AVX2-NEXT:    sarq $63, %r9
4255; AVX2-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
4256; AVX2-NEXT:    shlq $54, %rbx
4257; AVX2-NEXT:    sarq $63, %rbx
4258; AVX2-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
4259; AVX2-NEXT:    shlq $53, %r14
4260; AVX2-NEXT:    sarq $63, %r14
4261; AVX2-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
4262; AVX2-NEXT:    shlq $52, %r15
4263; AVX2-NEXT:    sarq $63, %r15
4264; AVX2-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
4265; AVX2-NEXT:    shlq $51, %r12
4266; AVX2-NEXT:    sarq $63, %r12
4267; AVX2-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
4268; AVX2-NEXT:    shlq $50, %rbp
4269; AVX2-NEXT:    sarq $63, %rbp
4270; AVX2-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
4271; AVX2-NEXT:    shlq $49, %rdx
4272; AVX2-NEXT:    sarq $63, %rdx
4273; AVX2-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
4274; AVX2-NEXT:    shrq $15, %rax
4275; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
4276; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
4277; AVX2-NEXT:    popq %rbx
4278; AVX2-NEXT:    popq %r12
4279; AVX2-NEXT:    popq %r13
4280; AVX2-NEXT:    popq %r14
4281; AVX2-NEXT:    popq %r15
4282; AVX2-NEXT:    popq %rbp
4283; AVX2-NEXT:    retq
4284;
4285; AVX512F-LABEL: load_sext_32i1_to_32i8:
4286; AVX512F:       # %bb.0: # %entry
4287; AVX512F-NEXT:    kmovw (%rdi), %k1
4288; AVX512F-NEXT:    kmovw 2(%rdi), %k2
4289; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
4290; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4291; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
4292; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
4293; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4294; AVX512F-NEXT:    retq
4295;
4296; AVX512BW-LABEL: load_sext_32i1_to_32i8:
4297; AVX512BW:       # %bb.0: # %entry
4298; AVX512BW-NEXT:    kmovd (%rdi), %k0
4299; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
4300; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4301; AVX512BW-NEXT:    retq
4302;
4303; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
4304; X32-SSE41:       # %bb.0: # %entry
4305; X32-SSE41-NEXT:    pushl %esi
4306; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4307; X32-SSE41-NEXT:    movswl (%eax), %ecx
4308; X32-SSE41-NEXT:    movl %ecx, %edx
4309; X32-SSE41-NEXT:    shll $30, %edx
4310; X32-SSE41-NEXT:    sarl $31, %edx
4311; X32-SSE41-NEXT:    movl %ecx, %esi
4312; X32-SSE41-NEXT:    shll $31, %esi
4313; X32-SSE41-NEXT:    sarl $31, %esi
4314; X32-SSE41-NEXT:    movd %esi, %xmm0
4315; X32-SSE41-NEXT:    pinsrb $1, %edx, %xmm0
4316; X32-SSE41-NEXT:    movl %ecx, %edx
4317; X32-SSE41-NEXT:    shll $29, %edx
4318; X32-SSE41-NEXT:    sarl $31, %edx
4319; X32-SSE41-NEXT:    pinsrb $2, %edx, %xmm0
4320; X32-SSE41-NEXT:    movl %ecx, %edx
4321; X32-SSE41-NEXT:    shll $28, %edx
4322; X32-SSE41-NEXT:    sarl $31, %edx
4323; X32-SSE41-NEXT:    pinsrb $3, %edx, %xmm0
4324; X32-SSE41-NEXT:    movl %ecx, %edx
4325; X32-SSE41-NEXT:    shll $27, %edx
4326; X32-SSE41-NEXT:    sarl $31, %edx
4327; X32-SSE41-NEXT:    pinsrb $4, %edx, %xmm0
4328; X32-SSE41-NEXT:    movl %ecx, %edx
4329; X32-SSE41-NEXT:    shll $26, %edx
4330; X32-SSE41-NEXT:    sarl $31, %edx
4331; X32-SSE41-NEXT:    pinsrb $5, %edx, %xmm0
4332; X32-SSE41-NEXT:    movl %ecx, %edx
4333; X32-SSE41-NEXT:    shll $25, %edx
4334; X32-SSE41-NEXT:    sarl $31, %edx
4335; X32-SSE41-NEXT:    pinsrb $6, %edx, %xmm0
4336; X32-SSE41-NEXT:    movsbl %cl, %edx
4337; X32-SSE41-NEXT:    shrl $7, %edx
4338; X32-SSE41-NEXT:    pinsrb $7, %edx, %xmm0
4339; X32-SSE41-NEXT:    movl %ecx, %edx
4340; X32-SSE41-NEXT:    shll $23, %edx
4341; X32-SSE41-NEXT:    sarl $31, %edx
4342; X32-SSE41-NEXT:    pinsrb $8, %edx, %xmm0
4343; X32-SSE41-NEXT:    movl %ecx, %edx
4344; X32-SSE41-NEXT:    shll $22, %edx
4345; X32-SSE41-NEXT:    sarl $31, %edx
4346; X32-SSE41-NEXT:    pinsrb $9, %edx, %xmm0
4347; X32-SSE41-NEXT:    movl %ecx, %edx
4348; X32-SSE41-NEXT:    shll $21, %edx
4349; X32-SSE41-NEXT:    sarl $31, %edx
4350; X32-SSE41-NEXT:    pinsrb $10, %edx, %xmm0
4351; X32-SSE41-NEXT:    movl %ecx, %edx
4352; X32-SSE41-NEXT:    shll $20, %edx
4353; X32-SSE41-NEXT:    sarl $31, %edx
4354; X32-SSE41-NEXT:    pinsrb $11, %edx, %xmm0
4355; X32-SSE41-NEXT:    movl %ecx, %edx
4356; X32-SSE41-NEXT:    shll $19, %edx
4357; X32-SSE41-NEXT:    sarl $31, %edx
4358; X32-SSE41-NEXT:    pinsrb $12, %edx, %xmm0
4359; X32-SSE41-NEXT:    movl %ecx, %edx
4360; X32-SSE41-NEXT:    shll $18, %edx
4361; X32-SSE41-NEXT:    sarl $31, %edx
4362; X32-SSE41-NEXT:    pinsrb $13, %edx, %xmm0
4363; X32-SSE41-NEXT:    movl %ecx, %edx
4364; X32-SSE41-NEXT:    shll $17, %edx
4365; X32-SSE41-NEXT:    sarl $31, %edx
4366; X32-SSE41-NEXT:    pinsrb $14, %edx, %xmm0
4367; X32-SSE41-NEXT:    shrl $15, %ecx
4368; X32-SSE41-NEXT:    pinsrb $15, %ecx, %xmm0
4369; X32-SSE41-NEXT:    movswl 2(%eax), %eax
4370; X32-SSE41-NEXT:    movl %eax, %ecx
4371; X32-SSE41-NEXT:    shll $30, %ecx
4372; X32-SSE41-NEXT:    sarl $31, %ecx
4373; X32-SSE41-NEXT:    movl %eax, %edx
4374; X32-SSE41-NEXT:    shll $31, %edx
4375; X32-SSE41-NEXT:    sarl $31, %edx
4376; X32-SSE41-NEXT:    movd %edx, %xmm1
4377; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
4378; X32-SSE41-NEXT:    movl %eax, %ecx
4379; X32-SSE41-NEXT:    shll $29, %ecx
4380; X32-SSE41-NEXT:    sarl $31, %ecx
4381; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
4382; X32-SSE41-NEXT:    movl %eax, %ecx
4383; X32-SSE41-NEXT:    shll $28, %ecx
4384; X32-SSE41-NEXT:    sarl $31, %ecx
4385; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
4386; X32-SSE41-NEXT:    movl %eax, %ecx
4387; X32-SSE41-NEXT:    shll $27, %ecx
4388; X32-SSE41-NEXT:    sarl $31, %ecx
4389; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
4390; X32-SSE41-NEXT:    movl %eax, %ecx
4391; X32-SSE41-NEXT:    shll $26, %ecx
4392; X32-SSE41-NEXT:    sarl $31, %ecx
4393; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
4394; X32-SSE41-NEXT:    movl %eax, %ecx
4395; X32-SSE41-NEXT:    shll $25, %ecx
4396; X32-SSE41-NEXT:    sarl $31, %ecx
4397; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
4398; X32-SSE41-NEXT:    movsbl %al, %ecx
4399; X32-SSE41-NEXT:    shrl $7, %ecx
4400; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
4401; X32-SSE41-NEXT:    movl %eax, %ecx
4402; X32-SSE41-NEXT:    shll $23, %ecx
4403; X32-SSE41-NEXT:    sarl $31, %ecx
4404; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
4405; X32-SSE41-NEXT:    movl %eax, %ecx
4406; X32-SSE41-NEXT:    shll $22, %ecx
4407; X32-SSE41-NEXT:    sarl $31, %ecx
4408; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
4409; X32-SSE41-NEXT:    movl %eax, %ecx
4410; X32-SSE41-NEXT:    shll $21, %ecx
4411; X32-SSE41-NEXT:    sarl $31, %ecx
4412; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
4413; X32-SSE41-NEXT:    movl %eax, %ecx
4414; X32-SSE41-NEXT:    shll $20, %ecx
4415; X32-SSE41-NEXT:    sarl $31, %ecx
4416; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
4417; X32-SSE41-NEXT:    movl %eax, %ecx
4418; X32-SSE41-NEXT:    shll $19, %ecx
4419; X32-SSE41-NEXT:    sarl $31, %ecx
4420; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
4421; X32-SSE41-NEXT:    movl %eax, %ecx
4422; X32-SSE41-NEXT:    shll $18, %ecx
4423; X32-SSE41-NEXT:    sarl $31, %ecx
4424; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
4425; X32-SSE41-NEXT:    movl %eax, %ecx
4426; X32-SSE41-NEXT:    shll $17, %ecx
4427; X32-SSE41-NEXT:    sarl $31, %ecx
4428; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
4429; X32-SSE41-NEXT:    shrl $15, %eax
4430; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
4431; X32-SSE41-NEXT:    popl %esi
4432; X32-SSE41-NEXT:    retl
4433entry:
4434 %X = load <32 x i1>, <32 x i1>* %ptr
4435 %Y = sext <32 x i1> %X to <32 x i8>
4436 ret <32 x i8> %Y
4437}
4438
; Sign-extending load test: reads a <16 x i8> vector through %ptr and widens
; it to <16 x i16>. According to the autogenerated assertions below:
;  - the SSE2 and SSSE3 runs expect, for each of the two result registers, a
;    movq load, a self-interleaving punpcklbw and an arithmetic shift right
;    by 8 (psraw);
;  - the SSE4.1 runs (64-bit and 32-bit) expect two pmovsxbw loads at
;    offsets 0 and 8 from the pointer;
;  - the AVX1 run expects two vpmovsxbw loads joined with vinsertf128;
;  - the AVX2 and AVX512 runs expect a single ymm-wide vpmovsxbw load.
; The assertions are generated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them instead of editing by hand.
4439define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
4440; SSE2-LABEL: load_sext_16i8_to_16i16:
4441; SSE2:       # %bb.0: # %entry
4442; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4443; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4444; SSE2-NEXT:    psraw $8, %xmm0
4445; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4446; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4447; SSE2-NEXT:    psraw $8, %xmm1
4448; SSE2-NEXT:    retq
4449;
4450; SSSE3-LABEL: load_sext_16i8_to_16i16:
4451; SSSE3:       # %bb.0: # %entry
4452; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4453; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4454; SSSE3-NEXT:    psraw $8, %xmm0
4455; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4456; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4457; SSSE3-NEXT:    psraw $8, %xmm1
4458; SSSE3-NEXT:    retq
4459;
4460; SSE41-LABEL: load_sext_16i8_to_16i16:
4461; SSE41:       # %bb.0: # %entry
4462; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
4463; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
4464; SSE41-NEXT:    retq
4465;
4466; AVX1-LABEL: load_sext_16i8_to_16i16:
4467; AVX1:       # %bb.0: # %entry
4468; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm0
4469; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm1
4470; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4471; AVX1-NEXT:    retq
4472;
4473; AVX2-LABEL: load_sext_16i8_to_16i16:
4474; AVX2:       # %bb.0: # %entry
4475; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
4476; AVX2-NEXT:    retq
4477;
4478; AVX512-LABEL: load_sext_16i8_to_16i16:
4479; AVX512:       # %bb.0: # %entry
4480; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
4481; AVX512-NEXT:    retq
4482;
4483; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
4484; X32-SSE41:       # %bb.0: # %entry
4485; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4486; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
4487; X32-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
4488; X32-SSE41-NEXT:    retl
4489entry:
4490 %X = load <16 x i8>, <16 x i8>* %ptr
4491 %Y = sext <16 x i8> %X to <16 x i16>
4492 ret <16 x i16> %Y
4493}
4494
; Sign-extending load test: reads a <2 x i16> vector through %ptr and widens
; it to <2 x i64>. According to the autogenerated assertions below:
;  - the SSE2 and SSSE3 runs expect a movd load followed by a pshuflw that
;    places each source word in the high half of a dword, psrad by 16 to form
;    the sign-extended low dwords, psrad by 31 on a copy to form the sign
;    dwords, and punpckldq to interleave the two into 64-bit lanes;
;  - the SSE4.1 runs (64-bit and 32-bit) expect a single pmovsxwq straight
;    from memory, and the AVX runs expect the vpmovsxwq equivalent.
; The assertions are generated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them instead of editing by hand.
4495define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
4496; SSE2-LABEL: load_sext_2i16_to_2i64:
4497; SSE2:       # %bb.0: # %entry
4498; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4499; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
4500; SSE2-NEXT:    movdqa %xmm0, %xmm1
4501; SSE2-NEXT:    psrad $31, %xmm1
4502; SSE2-NEXT:    psrad $16, %xmm0
4503; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4504; SSE2-NEXT:    retq
4505;
4506; SSSE3-LABEL: load_sext_2i16_to_2i64:
4507; SSSE3:       # %bb.0: # %entry
4508; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4509; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
4510; SSSE3-NEXT:    movdqa %xmm0, %xmm1
4511; SSSE3-NEXT:    psrad $31, %xmm1
4512; SSSE3-NEXT:    psrad $16, %xmm0
4513; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4514; SSSE3-NEXT:    retq
4515;
4516; SSE41-LABEL: load_sext_2i16_to_2i64:
4517; SSE41:       # %bb.0: # %entry
4518; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
4519; SSE41-NEXT:    retq
4520;
4521; AVX-LABEL: load_sext_2i16_to_2i64:
4522; AVX:       # %bb.0: # %entry
4523; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
4524; AVX-NEXT:    retq
4525;
4526; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
4527; X32-SSE41:       # %bb.0: # %entry
4528; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4529; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
4530; X32-SSE41-NEXT:    retl
4531entry:
4532 %X = load <2 x i16>, <2 x i16>* %ptr
4533 %Y = sext <2 x i16> %X to <2 x i64>
4534 ret <2 x i64> %Y
4535}
4536
; Loads a <4 x i16> vector through %ptr and sign-extends each lane to i32.
; The SSE4.1 and AVX expectations are a single (v)pmovsxwd with a memory
; operand. The SSE2 and SSSE3 expectations load 64 bits with movq, duplicate
; each word with punpcklwd and then shift arithmetically right by 16 (psrad).
4537define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
4538; SSE2-LABEL: load_sext_4i16_to_4i32:
4539; SSE2:       # %bb.0: # %entry
4540; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4541; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4542; SSE2-NEXT:    psrad $16, %xmm0
4543; SSE2-NEXT:    retq
4544;
4545; SSSE3-LABEL: load_sext_4i16_to_4i32:
4546; SSSE3:       # %bb.0: # %entry
4547; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4548; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4549; SSSE3-NEXT:    psrad $16, %xmm0
4550; SSSE3-NEXT:    retq
4551;
4552; SSE41-LABEL: load_sext_4i16_to_4i32:
4553; SSE41:       # %bb.0: # %entry
4554; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4555; SSE41-NEXT:    retq
4556;
4557; AVX-LABEL: load_sext_4i16_to_4i32:
4558; AVX:       # %bb.0: # %entry
4559; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
4560; AVX-NEXT:    retq
4561;
4562; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
4563; X32-SSE41:       # %bb.0: # %entry
4564; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4565; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
4566; X32-SSE41-NEXT:    retl
4567entry:
4568 %X = load <4 x i16>, <4 x i16>* %ptr
4569 %Y = sext <4 x i16> %X to <4 x i32>
4570 ret <4 x i32> %Y
4571}
4572
; Loads a <4 x i16> vector through %ptr and sign-extends each lane to i64,
; producing a <4 x i64> result.
; Expected shapes per run, as recorded below:
;  - SSE2 and SSSE3 use four scalar movswq loads (offsets 0, 2, 4, 6), each
;    moved into an xmm register and paired with punpcklqdq into xmm0 and xmm1.
;  - SSE4.1 (64-bit and 32-bit runs) uses two pmovsxwq loads at offsets 0 and 4.
;  - AVX1 uses one vpmovsxwd load, two vpmovsxdq steps and a vinsertf128.
;  - AVX2 and AVX512 use a single vpmovsxwq into ymm0.
4573define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
4574; SSE2-LABEL: load_sext_4i16_to_4i64:
4575; SSE2:       # %bb.0: # %entry
4576; SSE2-NEXT:    movswq 2(%rdi), %rax
4577; SSE2-NEXT:    movq %rax, %xmm1
4578; SSE2-NEXT:    movswq (%rdi), %rax
4579; SSE2-NEXT:    movq %rax, %xmm0
4580; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4581; SSE2-NEXT:    movswq 6(%rdi), %rax
4582; SSE2-NEXT:    movq %rax, %xmm2
4583; SSE2-NEXT:    movswq 4(%rdi), %rax
4584; SSE2-NEXT:    movq %rax, %xmm1
4585; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4586; SSE2-NEXT:    retq
4587;
4588; SSSE3-LABEL: load_sext_4i16_to_4i64:
4589; SSSE3:       # %bb.0: # %entry
4590; SSSE3-NEXT:    movswq 2(%rdi), %rax
4591; SSSE3-NEXT:    movq %rax, %xmm1
4592; SSSE3-NEXT:    movswq (%rdi), %rax
4593; SSSE3-NEXT:    movq %rax, %xmm0
4594; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4595; SSSE3-NEXT:    movswq 6(%rdi), %rax
4596; SSSE3-NEXT:    movq %rax, %xmm2
4597; SSSE3-NEXT:    movswq 4(%rdi), %rax
4598; SSSE3-NEXT:    movq %rax, %xmm1
4599; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4600; SSSE3-NEXT:    retq
4601;
4602; SSE41-LABEL: load_sext_4i16_to_4i64:
4603; SSE41:       # %bb.0: # %entry
4604; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
4605; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
4606; SSE41-NEXT:    retq
4607;
4608; AVX1-LABEL: load_sext_4i16_to_4i64:
4609; AVX1:       # %bb.0: # %entry
4610; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
4611; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4612; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4613; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4614; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4615; AVX1-NEXT:    retq
4616;
4617; AVX2-LABEL: load_sext_4i16_to_4i64:
4618; AVX2:       # %bb.0: # %entry
4619; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
4620; AVX2-NEXT:    retq
4621;
4622; AVX512-LABEL: load_sext_4i16_to_4i64:
4623; AVX512:       # %bb.0: # %entry
4624; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
4625; AVX512-NEXT:    retq
4626;
4627; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
4628; X32-SSE41:       # %bb.0: # %entry
4629; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4630; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
4631; X32-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
4632; X32-SSE41-NEXT:    retl
4633entry:
4634 %X = load <4 x i16>, <4 x i16>* %ptr
4635 %Y = sext <4 x i16> %X to <4 x i64>
4636 ret <4 x i64> %Y
4637}
4638
; Loads an <8 x i16> vector through %ptr and sign-extends each lane to i32,
; producing an <8 x i32> result.
; Expected shapes per run, as recorded below:
;  - SSE2 and SSSE3 run the movq / punpcklwd / psrad $16 sequence twice, once
;    per result register (xmm0 and xmm1).
;  - SSE4.1 (64-bit and 32-bit runs) uses two pmovsxwd loads at offsets 0 and 8.
;  - AVX1 uses two vpmovsxwd loads joined with vinsertf128.
;  - AVX2 and AVX512 use a single vpmovsxwd into ymm0.
4639define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
4640; SSE2-LABEL: load_sext_8i16_to_8i32:
4641; SSE2:       # %bb.0: # %entry
4642; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4643; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4644; SSE2-NEXT:    psrad $16, %xmm0
4645; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4646; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
4647; SSE2-NEXT:    psrad $16, %xmm1
4648; SSE2-NEXT:    retq
4649;
4650; SSSE3-LABEL: load_sext_8i16_to_8i32:
4651; SSSE3:       # %bb.0: # %entry
4652; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4653; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4654; SSSE3-NEXT:    psrad $16, %xmm0
4655; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4656; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
4657; SSSE3-NEXT:    psrad $16, %xmm1
4658; SSSE3-NEXT:    retq
4659;
4660; SSE41-LABEL: load_sext_8i16_to_8i32:
4661; SSE41:       # %bb.0: # %entry
4662; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4663; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
4664; SSE41-NEXT:    retq
4665;
4666; AVX1-LABEL: load_sext_8i16_to_8i32:
4667; AVX1:       # %bb.0: # %entry
4668; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
4669; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
4670; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4671; AVX1-NEXT:    retq
4672;
4673; AVX2-LABEL: load_sext_8i16_to_8i32:
4674; AVX2:       # %bb.0: # %entry
4675; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
4676; AVX2-NEXT:    retq
4677;
4678; AVX512-LABEL: load_sext_8i16_to_8i32:
4679; AVX512:       # %bb.0: # %entry
4680; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
4681; AVX512-NEXT:    retq
4682;
4683; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
4684; X32-SSE41:       # %bb.0: # %entry
4685; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4686; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
4687; X32-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
4688; X32-SSE41-NEXT:    retl
4689entry:
4690 %X = load <8 x i16>, <8 x i16>* %ptr
4691 %Y = sext <8 x i16> %X to <8 x i32>
4692 ret <8 x i32> %Y
4693}
4694
; Loads a <2 x i32> vector through %ptr and sign-extends each lane to i64.
; The SSE4.1 and AVX expectations are a single (v)pmovsxdq with a memory
; operand. The SSE2 and SSSE3 expectations load 64 bits with movq, derive a
; per-lane sign mask with psrad $31 on a copy, and interleave value and mask
; with punpckldq.
4695define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
4696; SSE2-LABEL: load_sext_2i32_to_2i64:
4697; SSE2:       # %bb.0: # %entry
4698; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4699; SSE2-NEXT:    movdqa %xmm0, %xmm1
4700; SSE2-NEXT:    psrad $31, %xmm1
4701; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4702; SSE2-NEXT:    retq
4703;
4704; SSSE3-LABEL: load_sext_2i32_to_2i64:
4705; SSSE3:       # %bb.0: # %entry
4706; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4707; SSSE3-NEXT:    movdqa %xmm0, %xmm1
4708; SSSE3-NEXT:    psrad $31, %xmm1
4709; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4710; SSSE3-NEXT:    retq
4711;
4712; SSE41-LABEL: load_sext_2i32_to_2i64:
4713; SSE41:       # %bb.0: # %entry
4714; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
4715; SSE41-NEXT:    retq
4716;
4717; AVX-LABEL: load_sext_2i32_to_2i64:
4718; AVX:       # %bb.0: # %entry
4719; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
4720; AVX-NEXT:    retq
4721;
4722; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
4723; X32-SSE41:       # %bb.0: # %entry
4724; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4725; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
4726; X32-SSE41-NEXT:    retl
4727entry:
4728 %X = load <2 x i32>, <2 x i32>* %ptr
4729 %Y = sext <2 x i32> %X to <2 x i64>
4730 ret <2 x i64> %Y
4731}
4732
; Loads a <4 x i32> vector through %ptr and sign-extends each lane to i64,
; producing a <4 x i64> result.
; Expected shapes per run, as recorded below:
;  - SSE2 and SSSE3 do one movdqa load, then for each half build a sign mask
;    with psrad $31 and interleave it with the values via punpckldq (pshufd
;    moves the upper two lanes into position for the second half).
;  - SSE4.1 (64-bit and 32-bit runs) uses two pmovsxdq loads at offsets 0 and 8.
;  - AVX1 uses two vpmovsxdq loads joined with vinsertf128.
;  - AVX2 and AVX512 use a single vpmovsxdq into ymm0.
4733define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
4734; SSE2-LABEL: load_sext_4i32_to_4i64:
4735; SSE2:       # %bb.0: # %entry
4736; SSE2-NEXT:    movdqa (%rdi), %xmm0
4737; SSE2-NEXT:    movdqa %xmm0, %xmm2
4738; SSE2-NEXT:    psrad $31, %xmm2
4739; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4740; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4741; SSE2-NEXT:    movdqa %xmm1, %xmm2
4742; SSE2-NEXT:    psrad $31, %xmm2
4743; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4744; SSE2-NEXT:    retq
4745;
4746; SSSE3-LABEL: load_sext_4i32_to_4i64:
4747; SSSE3:       # %bb.0: # %entry
4748; SSSE3-NEXT:    movdqa (%rdi), %xmm0
4749; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4750; SSSE3-NEXT:    psrad $31, %xmm2
4751; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4752; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4753; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4754; SSSE3-NEXT:    psrad $31, %xmm2
4755; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4756; SSSE3-NEXT:    retq
4757;
4758; SSE41-LABEL: load_sext_4i32_to_4i64:
4759; SSE41:       # %bb.0: # %entry
4760; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
4761; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
4762; SSE41-NEXT:    retq
4763;
4764; AVX1-LABEL: load_sext_4i32_to_4i64:
4765; AVX1:       # %bb.0: # %entry
4766; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm0
4767; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm1
4768; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4769; AVX1-NEXT:    retq
4770;
4771; AVX2-LABEL: load_sext_4i32_to_4i64:
4772; AVX2:       # %bb.0: # %entry
4773; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
4774; AVX2-NEXT:    retq
4775;
4776; AVX512-LABEL: load_sext_4i32_to_4i64:
4777; AVX512:       # %bb.0: # %entry
4778; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
4779; AVX512-NEXT:    retq
4780;
4781; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
4782; X32-SSE41:       # %bb.0: # %entry
4783; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4784; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
4785; X32-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
4786; X32-SSE41-NEXT:    retl
4787entry:
4788 %X = load <4 x i32>, <4 x i32>* %ptr
4789 %Y = sext <4 x i32> %X to <4 x i64>
4790 ret <4 x i64> %Y
4791}
4792
; Takes lanes 0 and 1 of the <16 x i8> argument %A (shufflevector with mask
; <0, 1>), sign-extends them to <2 x i16>, and returns the pair reinterpreted
; as a scalar i32 via bitcast.
; In every run the expectation is a whole-register byte-to-word sign extension
; (punpcklbw + psraw $8 before SSE4.1, (v)pmovsxbw otherwise) followed by a
; movd of the low 32 bits into eax. The 32-bit run additionally expects a
; pushl/popl pair with matching .cfi_def_cfa_offset directives around it.
4793define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
4794; SSE2-LABEL: sext_2i8_to_i32:
4795; SSE2:       # %bb.0: # %entry
4796; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4797; SSE2-NEXT:    psraw $8, %xmm0
4798; SSE2-NEXT:    movd %xmm0, %eax
4799; SSE2-NEXT:    retq
4800;
4801; SSSE3-LABEL: sext_2i8_to_i32:
4802; SSSE3:       # %bb.0: # %entry
4803; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4804; SSSE3-NEXT:    psraw $8, %xmm0
4805; SSSE3-NEXT:    movd %xmm0, %eax
4806; SSSE3-NEXT:    retq
4807;
4808; SSE41-LABEL: sext_2i8_to_i32:
4809; SSE41:       # %bb.0: # %entry
4810; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
4811; SSE41-NEXT:    movd %xmm0, %eax
4812; SSE41-NEXT:    retq
4813;
4814; AVX-LABEL: sext_2i8_to_i32:
4815; AVX:       # %bb.0: # %entry
4816; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
4817; AVX-NEXT:    vmovd %xmm0, %eax
4818; AVX-NEXT:    retq
4819;
4820; X32-SSE41-LABEL: sext_2i8_to_i32:
4821; X32-SSE41:       # %bb.0: # %entry
4822; X32-SSE41-NEXT:    pushl %eax
4823; X32-SSE41-NEXT:    .cfi_def_cfa_offset 8
4824; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
4825; X32-SSE41-NEXT:    movd %xmm0, %eax
4826; X32-SSE41-NEXT:    popl %ecx
4827; X32-SSE41-NEXT:    .cfi_def_cfa_offset 4
4828; X32-SSE41-NEXT:    retl
4829entry:
4830  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
4831  %Ex = sext <2 x i8> %Shuf to <2 x i16>
4832  %Bc = bitcast <2 x i16> %Ex to i32
4833  ret i32 %Bc
4834}
4835
; Sign-extends a <4 x i1> argument directly to <4 x i64>.
; Every run's expectation starts by shifting each 32-bit lane left by 31 and
; then arithmetically right by 31 ((v)pslld $31 / (v)psrad $31), and then
; widens the 32-bit lanes to 64 bits:
;  - SSE2 and SSSE3 interleave values with a psrad $31 sign mask (punpckldq).
;  - SSE4.1 (64-bit and 32-bit runs) and AVX1 use two (v)pmovsxdq steps, with
;    (v)pshufd bringing the upper two lanes down for the second one.
;  - AVX2 and AVX512 use a single vpmovsxdq into ymm0.
; This function has no named entry block, so the expected block comment is
; "# %bb.0:" without a trailing "# %entry".
4836define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
4837; SSE2-LABEL: sext_4i1_to_4i64:
4838; SSE2:       # %bb.0:
4839; SSE2-NEXT:    pslld $31, %xmm0
4840; SSE2-NEXT:    psrad $31, %xmm0
4841; SSE2-NEXT:    movdqa %xmm0, %xmm2
4842; SSE2-NEXT:    psrad $31, %xmm2
4843; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4844; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4845; SSE2-NEXT:    movdqa %xmm1, %xmm2
4846; SSE2-NEXT:    psrad $31, %xmm2
4847; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4848; SSE2-NEXT:    retq
4849;
4850; SSSE3-LABEL: sext_4i1_to_4i64:
4851; SSSE3:       # %bb.0:
4852; SSSE3-NEXT:    pslld $31, %xmm0
4853; SSSE3-NEXT:    psrad $31, %xmm0
4854; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4855; SSSE3-NEXT:    psrad $31, %xmm2
4856; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4857; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4858; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4859; SSSE3-NEXT:    psrad $31, %xmm2
4860; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4861; SSSE3-NEXT:    retq
4862;
4863; SSE41-LABEL: sext_4i1_to_4i64:
4864; SSE41:       # %bb.0:
4865; SSE41-NEXT:    pslld $31, %xmm0
4866; SSE41-NEXT:    psrad $31, %xmm0
4867; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4868; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4869; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4870; SSE41-NEXT:    movdqa %xmm2, %xmm0
4871; SSE41-NEXT:    retq
4872;
4873; AVX1-LABEL: sext_4i1_to_4i64:
4874; AVX1:       # %bb.0:
4875; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
4876; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
4877; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4878; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4879; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4880; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4881; AVX1-NEXT:    retq
4882;
4883; AVX2-LABEL: sext_4i1_to_4i64:
4884; AVX2:       # %bb.0:
4885; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
4886; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
4887; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
4888; AVX2-NEXT:    retq
4889;
4890; AVX512-LABEL: sext_4i1_to_4i64:
4891; AVX512:       # %bb.0:
4892; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
4893; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
4894; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
4895; AVX512-NEXT:    retq
4896;
4897; X32-SSE41-LABEL: sext_4i1_to_4i64:
4898; X32-SSE41:       # %bb.0:
4899; X32-SSE41-NEXT:    pslld $31, %xmm0
4900; X32-SSE41-NEXT:    psrad $31, %xmm0
4901; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4902; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4903; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4904; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
4905; X32-SSE41-NEXT:    retl
4906  %extmask = sext <4 x i1> %mask to <4 x i64>
4907  ret <4 x i64> %extmask
4908}
4909
; Sign-extends a <4 x i8> argument directly to <4 x i64>.
; Every run's expectation starts by shifting each 32-bit lane left by 24 and
; then arithmetically right by 24 ((v)pslld $24 / (v)psrad $24), and then
; widens the 32-bit lanes to 64 bits:
;  - SSE2 and SSSE3 interleave values with a psrad $31 sign mask (punpckldq).
;  - SSE4.1 (64-bit and 32-bit runs) and AVX1 use two (v)pmovsxdq steps, with
;    (v)pshufd bringing the upper two lanes down for the second one.
;  - AVX2 and AVX512 use a single vpmovsxdq into ymm0.
4910define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
4911; SSE2-LABEL: sext_4i8_to_4i64:
4912; SSE2:       # %bb.0:
4913; SSE2-NEXT:    pslld $24, %xmm0
4914; SSE2-NEXT:    psrad $24, %xmm0
4915; SSE2-NEXT:    movdqa %xmm0, %xmm2
4916; SSE2-NEXT:    psrad $31, %xmm2
4917; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4918; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4919; SSE2-NEXT:    movdqa %xmm1, %xmm2
4920; SSE2-NEXT:    psrad $31, %xmm2
4921; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4922; SSE2-NEXT:    retq
4923;
4924; SSSE3-LABEL: sext_4i8_to_4i64:
4925; SSSE3:       # %bb.0:
4926; SSSE3-NEXT:    pslld $24, %xmm0
4927; SSSE3-NEXT:    psrad $24, %xmm0
4928; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4929; SSSE3-NEXT:    psrad $31, %xmm2
4930; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4931; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4932; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4933; SSSE3-NEXT:    psrad $31, %xmm2
4934; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4935; SSSE3-NEXT:    retq
4936;
4937; SSE41-LABEL: sext_4i8_to_4i64:
4938; SSE41:       # %bb.0:
4939; SSE41-NEXT:    pslld $24, %xmm0
4940; SSE41-NEXT:    psrad $24, %xmm0
4941; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4942; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4943; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4944; SSE41-NEXT:    movdqa %xmm2, %xmm0
4945; SSE41-NEXT:    retq
4946;
4947; AVX1-LABEL: sext_4i8_to_4i64:
4948; AVX1:       # %bb.0:
4949; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
4950; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
4951; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4952; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4953; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4954; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4955; AVX1-NEXT:    retq
4956;
4957; AVX2-LABEL: sext_4i8_to_4i64:
4958; AVX2:       # %bb.0:
4959; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
4960; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
4961; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
4962; AVX2-NEXT:    retq
4963;
4964; AVX512-LABEL: sext_4i8_to_4i64:
4965; AVX512:       # %bb.0:
4966; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
4967; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
4968; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
4969; AVX512-NEXT:    retq
4970;
4971; X32-SSE41-LABEL: sext_4i8_to_4i64:
4972; X32-SSE41:       # %bb.0:
4973; X32-SSE41-NEXT:    pslld $24, %xmm0
4974; X32-SSE41-NEXT:    psrad $24, %xmm0
4975; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4976; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4977; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4978; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
4979; X32-SSE41-NEXT:    retl
4980  %extmask = sext <4 x i8> %mask to <4 x i64>
4981  ret <4 x i64> %extmask
4982}
4983
; Compares two <32 x i16> arguments for equality and sign-extends the
; resulting <32 x i1> to <32 x i8>.
; Expected shapes per run, as recorded below:
;  - The shared SSE expectation (used by the SSE2, SSSE3 and SSE4.1 64-bit
;    runs) is four pcmpeqw compares narrowed pairwise with packsswb.
;  - AVX1 splits the ymm inputs with vextractf128, compares 128 bits at a time,
;    packs with vpacksswb and rejoins with vinsertf128.
;  - AVX2 compares whole ymm registers, packs with vpacksswb and then reorders
;    the 64-bit quarters with vpermq [0,2,1,3].
;  - The AVX512F-only run widens each compare result with vpmovsxwd to zmm and
;    truncates with vpmovdb; the AVX512BW run compares into mask register k0
;    and expands it with vpmovm2b.
;  - The 32-bit run reads part of its operands from ebp-relative stack slots
;    and sets up a frame with the stack pointer masked by $-16.
4984define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
4985; SSE-LABEL: sext_32xi1_to_32xi8:
4986; SSE:       # %bb.0:
4987; SSE-NEXT:    pcmpeqw %xmm5, %xmm1
4988; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
4989; SSE-NEXT:    packsswb %xmm1, %xmm0
4990; SSE-NEXT:    pcmpeqw %xmm7, %xmm3
4991; SSE-NEXT:    pcmpeqw %xmm6, %xmm2
4992; SSE-NEXT:    packsswb %xmm3, %xmm2
4993; SSE-NEXT:    movdqa %xmm2, %xmm1
4994; SSE-NEXT:    retq
4995;
4996; AVX1-LABEL: sext_32xi1_to_32xi8:
4997; AVX1:       # %bb.0:
4998; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
4999; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
5000; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
5001; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
5002; AVX1-NEXT:    vpacksswb %xmm4, %xmm1, %xmm1
5003; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
5004; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
5005; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm3
5006; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
5007; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
5008; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
5009; AVX1-NEXT:    retq
5010;
5011; AVX2-LABEL: sext_32xi1_to_32xi8:
5012; AVX2:       # %bb.0:
5013; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
5014; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
5015; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
5016; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5017; AVX2-NEXT:    retq
5018;
5019; AVX512F-LABEL: sext_32xi1_to_32xi8:
5020; AVX512F:       # %bb.0:
5021; AVX512F-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
5022; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
5023; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
5024; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
5025; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
5026; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
5027; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
5028; AVX512F-NEXT:    retq
5029;
5030; AVX512BW-LABEL: sext_32xi1_to_32xi8:
5031; AVX512BW:       # %bb.0:
5032; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
5033; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
5034; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
5035; AVX512BW-NEXT:    retq
5036;
5037; X32-SSE41-LABEL: sext_32xi1_to_32xi8:
5038; X32-SSE41:       # %bb.0:
5039; X32-SSE41-NEXT:    pushl %ebp
5040; X32-SSE41-NEXT:    movl %esp, %ebp
5041; X32-SSE41-NEXT:    andl $-16, %esp
5042; X32-SSE41-NEXT:    subl $16, %esp
5043; X32-SSE41-NEXT:    movdqa 8(%ebp), %xmm3
5044; X32-SSE41-NEXT:    pcmpeqw 40(%ebp), %xmm1
5045; X32-SSE41-NEXT:    pcmpeqw 24(%ebp), %xmm0
5046; X32-SSE41-NEXT:    packsswb %xmm1, %xmm0
5047; X32-SSE41-NEXT:    pcmpeqw 72(%ebp), %xmm3
5048; X32-SSE41-NEXT:    pcmpeqw 56(%ebp), %xmm2
5049; X32-SSE41-NEXT:    packsswb %xmm3, %xmm2
5050; X32-SSE41-NEXT:    movdqa %xmm2, %xmm1
5051; X32-SSE41-NEXT:    movl %ebp, %esp
5052; X32-SSE41-NEXT:    popl %ebp
5053; X32-SSE41-NEXT:    retl
5054  %a = icmp eq <32 x i16> %c1, %c2
5055  %b = sext <32 x i1> %a to <32 x i8>
5056  ret <32 x i8> %b
5057}
5058
; Loads a <2 x i8> vector from %addr with byte alignment (align 1),
; sign-extends each lane to i32, and returns the vector added to itself.
; Note that the recorded expectations work on 64-bit lanes even though the IR
; result type is <2 x i32>: SSE4.1 and AVX extend with (v)pmovsxbq and add
; with (v)paddq. The SSE2 expectation loads 16 bits with movzwl, spreads the
; bytes with punpcklbw + punpcklwd, sign-fills with psrad $24, repositions
; with pshufd and adds with paddq; SSSE3 replaces the two unpacks with one
; pshufb.
5059define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
5060; SSE2-LABEL: sext_2i8_to_2i32:
5061; SSE2:       # %bb.0:
5062; SSE2-NEXT:    movzwl (%rdi), %eax
5063; SSE2-NEXT:    movd %eax, %xmm0
5064; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5065; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
5066; SSE2-NEXT:    psrad $24, %xmm0
5067; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
5068; SSE2-NEXT:    paddq %xmm0, %xmm0
5069; SSE2-NEXT:    retq
5070;
5071; SSSE3-LABEL: sext_2i8_to_2i32:
5072; SSSE3:       # %bb.0:
5073; SSSE3-NEXT:    movzwl (%rdi), %eax
5074; SSSE3-NEXT:    movd %eax, %xmm0
5075; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u]
5076; SSSE3-NEXT:    psrad $24, %xmm0
5077; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
5078; SSSE3-NEXT:    paddq %xmm0, %xmm0
5079; SSSE3-NEXT:    retq
5080;
5081; SSE41-LABEL: sext_2i8_to_2i32:
5082; SSE41:       # %bb.0:
5083; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
5084; SSE41-NEXT:    paddq %xmm0, %xmm0
5085; SSE41-NEXT:    retq
5086;
5087; AVX-LABEL: sext_2i8_to_2i32:
5088; AVX:       # %bb.0:
5089; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
5090; AVX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
5091; AVX-NEXT:    retq
5092;
5093; X32-SSE41-LABEL: sext_2i8_to_2i32:
5094; X32-SSE41:       # %bb.0:
5095; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
5096; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
5097; X32-SSE41-NEXT:    paddq %xmm0, %xmm0
5098; X32-SSE41-NEXT:    retl
5099  %x = load <2 x i8>, <2 x i8>* %addr, align 1
5100  %y = sext <2 x i8> %x to <2 x i32>
5101  %z = add <2 x i32>%y, %y
5102  ret <2 x i32>%z
5103}
5104
5105