• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
8;
9; Just one 32-bit run to make sure we do reasonable things there.
10; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
11
; Sign-extend the low eight i8 lanes of a 128-bit vector to <8 x i16>.
; The checks below expect a single pmovsxbw/vpmovsxbw once SSE4.1 or AVX is
; enabled. The SSE2 and SSSE3 runs instead expect each byte to be unpacked
; into a word lane (punpcklbw) and then arithmetically shifted right by 8.
12define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
13; SSE2-LABEL: sext_16i8_to_8i16:
14; SSE2:       # BB#0: # %entry
15; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
16; SSE2-NEXT:    psraw $8, %xmm0
17; SSE2-NEXT:    retq
18;
19; SSSE3-LABEL: sext_16i8_to_8i16:
20; SSSE3:       # BB#0: # %entry
21; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
22; SSSE3-NEXT:    psraw $8, %xmm0
23; SSSE3-NEXT:    retq
24;
25; SSE41-LABEL: sext_16i8_to_8i16:
26; SSE41:       # BB#0: # %entry
27; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
28; SSE41-NEXT:    retq
29;
30; AVX-LABEL: sext_16i8_to_8i16:
31; AVX:       # BB#0: # %entry
32; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
33; AVX-NEXT:    retq
34;
35; X32-SSE41-LABEL: sext_16i8_to_8i16:
36; X32-SSE41:       # BB#0: # %entry
37; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
38; X32-SSE41-NEXT:    retl
39entry:
; Keep only elements 0-7 of %A; the upper eight bytes are not used.
40  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
41  %C = sext <8 x i8> %B to <8 x i16>
42  ret <8 x i16> %C
43}
44
; Sign-extend all sixteen i8 lanes to <16 x i16> (a 256-bit result).
; The AVX2 and AVX512 runs expect one vpmovsxbw writing ymm0. The AVX1 run
; expects two 128-bit extends joined with vinsertf128. The SSE runs expect the
; result split across xmm0 (low half) and xmm1 (high half).
45define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
46; SSE2-LABEL: sext_16i8_to_16i16:
47; SSE2:       # BB#0: # %entry
48; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
49; SSE2-NEXT:    psraw $8, %xmm2
50; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
51; SSE2-NEXT:    psraw $8, %xmm1
52; SSE2-NEXT:    movdqa %xmm2, %xmm0
53; SSE2-NEXT:    retq
54;
55; SSSE3-LABEL: sext_16i8_to_16i16:
56; SSSE3:       # BB#0: # %entry
57; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
58; SSSE3-NEXT:    psraw $8, %xmm2
59; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
60; SSSE3-NEXT:    psraw $8, %xmm1
61; SSSE3-NEXT:    movdqa %xmm2, %xmm0
62; SSSE3-NEXT:    retq
63;
64; SSE41-LABEL: sext_16i8_to_16i16:
65; SSE41:       # BB#0: # %entry
66; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
67; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
68; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
69; SSE41-NEXT:    movdqa %xmm2, %xmm0
70; SSE41-NEXT:    retq
71;
72; AVX1-LABEL: sext_16i8_to_16i16:
73; AVX1:       # BB#0: # %entry
74; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
75; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
76; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
77; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
78; AVX1-NEXT:    retq
79;
80; AVX2-LABEL: sext_16i8_to_16i16:
81; AVX2:       # BB#0: # %entry
82; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
83; AVX2-NEXT:    retq
84;
85; AVX512-LABEL: sext_16i8_to_16i16:
86; AVX512:       # BB#0: # %entry
87; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
88; AVX512-NEXT:    retq
89;
90; X32-SSE41-LABEL: sext_16i8_to_16i16:
91; X32-SSE41:       # BB#0: # %entry
92; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
93; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
94; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
95; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
96; X32-SSE41-NEXT:    retl
97entry:
; Full-width extend: no shufflevector is needed because every lane is used.
98  %B = sext <16 x i8> %A to <16 x i16>
99  ret <16 x i16> %B
100}
101
; Sign-extend the low four i8 lanes of a 128-bit vector to <4 x i32>.
; The checks expect pmovsxbd/vpmovsxbd with SSE4.1 or AVX. The SSE2 and SSSE3
; runs expect two unpacks (byte to word, word to dword) followed by an
; arithmetic right shift of each dword by 24.
102define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
103; SSE2-LABEL: sext_16i8_to_4i32:
104; SSE2:       # BB#0: # %entry
105; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
106; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
107; SSE2-NEXT:    psrad $24, %xmm0
108; SSE2-NEXT:    retq
109;
110; SSSE3-LABEL: sext_16i8_to_4i32:
111; SSSE3:       # BB#0: # %entry
112; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
113; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
114; SSSE3-NEXT:    psrad $24, %xmm0
115; SSSE3-NEXT:    retq
116;
117; SSE41-LABEL: sext_16i8_to_4i32:
118; SSE41:       # BB#0: # %entry
119; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
120; SSE41-NEXT:    retq
121;
122; AVX-LABEL: sext_16i8_to_4i32:
123; AVX:       # BB#0: # %entry
124; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
125; AVX-NEXT:    retq
126;
127; X32-SSE41-LABEL: sext_16i8_to_4i32:
128; X32-SSE41:       # BB#0: # %entry
129; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
130; X32-SSE41-NEXT:    retl
131entry:
; Keep only elements 0-3 of %A.
132  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
133  %C = sext <4 x i8> %B to <4 x i32>
134  ret <4 x i32> %C
135}
136
; Sign-extend the low eight i8 lanes of a 128-bit vector to <8 x i32>
; (a 256-bit result). The AVX2 and AVX512 runs expect one vpmovsxbd writing
; ymm0; the other runs expect the two four-lane halves to be extended
; separately. Note that the SSSE3 expectation differs from SSE2 here: it uses
; pshufb to place bytes 4-7 in the top byte of each dword before the shift.
137define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
138; SSE2-LABEL: sext_16i8_to_8i32:
139; SSE2:       # BB#0: # %entry
140; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
141; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
142; SSE2-NEXT:    psrad $24, %xmm2
143; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
144; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
145; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
146; SSE2-NEXT:    psrad $24, %xmm1
147; SSE2-NEXT:    movdqa %xmm2, %xmm0
148; SSE2-NEXT:    retq
149;
150; SSSE3-LABEL: sext_16i8_to_8i32:
151; SSSE3:       # BB#0: # %entry
152; SSSE3-NEXT:    movdqa %xmm0, %xmm1
153; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
154; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
155; SSSE3-NEXT:    psrad $24, %xmm0
156; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
157; SSSE3-NEXT:    psrad $24, %xmm1
158; SSSE3-NEXT:    retq
159;
160; SSE41-LABEL: sext_16i8_to_8i32:
161; SSE41:       # BB#0: # %entry
162; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
163; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
164; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
165; SSE41-NEXT:    movdqa %xmm2, %xmm0
166; SSE41-NEXT:    retq
167;
168; AVX1-LABEL: sext_16i8_to_8i32:
169; AVX1:       # BB#0: # %entry
170; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
171; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
172; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
173; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
174; AVX1-NEXT:    retq
175;
176; AVX2-LABEL: sext_16i8_to_8i32:
177; AVX2:       # BB#0: # %entry
178; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
179; AVX2-NEXT:    retq
180;
181; AVX512-LABEL: sext_16i8_to_8i32:
182; AVX512:       # BB#0: # %entry
183; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
184; AVX512-NEXT:    retq
185;
186; X32-SSE41-LABEL: sext_16i8_to_8i32:
187; X32-SSE41:       # BB#0: # %entry
188; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
189; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
190; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
191; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
192; X32-SSE41-NEXT:    retl
193entry:
; Keep only elements 0-7 of %A; the upper eight bytes are not used.
194  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
195  %C = sext <8 x i8> %B to <8 x i32>
196  ret <8 x i32> %C
197}
198
; Sign-extend the low two i8 lanes of a 128-bit vector to <2 x i64>.
; The checks expect pmovsxbq/vpmovsxbq with SSE4.1 or AVX. The SSE2 and SSSE3
; runs expect the bytes to be widened into dword lanes, then one copy shifted
; by 31 (sign dwords) and one by 24 (value dwords), interleaved by punpckldq.
199define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
200; SSE2-LABEL: sext_16i8_to_2i64:
201; SSE2:       # BB#0: # %entry
202; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
203; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
204; SSE2-NEXT:    movdqa %xmm0, %xmm1
205; SSE2-NEXT:    psrad $31, %xmm1
206; SSE2-NEXT:    psrad $24, %xmm0
207; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
208; SSE2-NEXT:    retq
209;
210; SSSE3-LABEL: sext_16i8_to_2i64:
211; SSSE3:       # BB#0: # %entry
212; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
213; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
214; SSSE3-NEXT:    movdqa %xmm0, %xmm1
215; SSSE3-NEXT:    psrad $31, %xmm1
216; SSSE3-NEXT:    psrad $24, %xmm0
217; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
218; SSSE3-NEXT:    retq
219;
220; SSE41-LABEL: sext_16i8_to_2i64:
221; SSE41:       # BB#0: # %entry
222; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
223; SSE41-NEXT:    retq
224;
225; AVX-LABEL: sext_16i8_to_2i64:
226; AVX:       # BB#0: # %entry
227; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
228; AVX-NEXT:    retq
229;
230; X32-SSE41-LABEL: sext_16i8_to_2i64:
231; X32-SSE41:       # BB#0: # %entry
232; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
233; X32-SSE41-NEXT:    retl
234entry:
; Keep only elements 0-1 of %A.
235  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
236  %C = sext <2 x i8> %B to <2 x i64>
237  ret <2 x i64> %C
238}
239
; Sign-extend the low four i8 lanes of a 128-bit vector to <4 x i64>
; (a 256-bit result). The AVX2 and AVX512 runs expect one vpmovsxbq writing
; ymm0. The other runs expect two two-lane extends, with a 16-bit logical
; right shift (psrld $16) bringing bytes 2-3 down for the second half.
240define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
241; SSE2-LABEL: sext_16i8_to_4i64:
242; SSE2:       # BB#0: # %entry
243; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
244; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
245; SSE2-NEXT:    movdqa %xmm2, %xmm1
246; SSE2-NEXT:    psrad $31, %xmm1
247; SSE2-NEXT:    psrad $24, %xmm2
248; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
249; SSE2-NEXT:    psrld $16, %xmm0
250; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
251; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
252; SSE2-NEXT:    movdqa %xmm1, %xmm0
253; SSE2-NEXT:    psrad $31, %xmm0
254; SSE2-NEXT:    psrad $24, %xmm1
255; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
256; SSE2-NEXT:    movdqa %xmm2, %xmm0
257; SSE2-NEXT:    retq
258;
259; SSSE3-LABEL: sext_16i8_to_4i64:
260; SSSE3:       # BB#0: # %entry
261; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
262; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
263; SSSE3-NEXT:    movdqa %xmm2, %xmm1
264; SSSE3-NEXT:    psrad $31, %xmm1
265; SSSE3-NEXT:    psrad $24, %xmm2
266; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
267; SSSE3-NEXT:    psrld $16, %xmm0
268; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
269; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
270; SSSE3-NEXT:    movdqa %xmm1, %xmm0
271; SSSE3-NEXT:    psrad $31, %xmm0
272; SSSE3-NEXT:    psrad $24, %xmm1
273; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
274; SSSE3-NEXT:    movdqa %xmm2, %xmm0
275; SSSE3-NEXT:    retq
276;
277; SSE41-LABEL: sext_16i8_to_4i64:
278; SSE41:       # BB#0: # %entry
279; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
280; SSE41-NEXT:    psrld $16, %xmm0
281; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
282; SSE41-NEXT:    movdqa %xmm2, %xmm0
283; SSE41-NEXT:    retq
284;
285; AVX1-LABEL: sext_16i8_to_4i64:
286; AVX1:       # BB#0: # %entry
287; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
288; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
289; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
290; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
291; AVX1-NEXT:    retq
292;
293; AVX2-LABEL: sext_16i8_to_4i64:
294; AVX2:       # BB#0: # %entry
295; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
296; AVX2-NEXT:    retq
297;
298; AVX512-LABEL: sext_16i8_to_4i64:
299; AVX512:       # BB#0: # %entry
300; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
301; AVX512-NEXT:    retq
302;
303; X32-SSE41-LABEL: sext_16i8_to_4i64:
304; X32-SSE41:       # BB#0: # %entry
305; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
306; X32-SSE41-NEXT:    psrld $16, %xmm0
307; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
308; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
309; X32-SSE41-NEXT:    retl
310entry:
; Keep only elements 0-3 of %A.
311  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
312  %C = sext <4 x i8> %B to <4 x i64>
313  ret <4 x i64> %C
314}
315
; Sign-extend the low eight i8 lanes of a 128-bit vector to <8 x i64>
; (a 512-bit result). The SSE runs expect four 128-bit pieces returned in
; xmm0-xmm3, and the AVX1 and AVX2 runs expect two 256-bit pieces in ymm0-ymm1.
; For this case the recorded AVX2 and AVX512 expectations are not a single
; sign-extending move: AVX2 zero-extends to dwords, shifts left then
; arithmetically right by 24, and finishes with vpmovsxdq; AVX512 zero-extends
; to qwords in zmm0 and shifts left then arithmetically right by 56.
316define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
317; SSE2-LABEL: sext_16i8_to_8i64:
318; SSE2:       # BB#0: # %entry
319; SSE2-NEXT:    movdqa %xmm0, %xmm1
320; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
321; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
322; SSE2-NEXT:    movdqa %xmm0, %xmm2
323; SSE2-NEXT:    psrad $31, %xmm2
324; SSE2-NEXT:    psrad $24, %xmm0
325; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
326; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
327; SSE2-NEXT:    psrld $16, %xmm1
328; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
329; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
330; SSE2-NEXT:    movdqa %xmm1, %xmm2
331; SSE2-NEXT:    psrad $31, %xmm2
332; SSE2-NEXT:    psrad $24, %xmm1
333; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
334; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
335; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
336; SSE2-NEXT:    movdqa %xmm2, %xmm4
337; SSE2-NEXT:    psrad $31, %xmm4
338; SSE2-NEXT:    psrad $24, %xmm2
339; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
340; SSE2-NEXT:    psrld $16, %xmm3
341; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
342; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
343; SSE2-NEXT:    movdqa %xmm3, %xmm4
344; SSE2-NEXT:    psrad $31, %xmm4
345; SSE2-NEXT:    psrad $24, %xmm3
346; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
347; SSE2-NEXT:    retq
348;
349; SSSE3-LABEL: sext_16i8_to_8i64:
350; SSSE3:       # BB#0: # %entry
351; SSSE3-NEXT:    movdqa %xmm0, %xmm1
352; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
353; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
354; SSSE3-NEXT:    movdqa %xmm0, %xmm2
355; SSSE3-NEXT:    psrad $31, %xmm2
356; SSSE3-NEXT:    psrad $24, %xmm0
357; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
358; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
359; SSSE3-NEXT:    psrld $16, %xmm1
360; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
361; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
362; SSSE3-NEXT:    movdqa %xmm1, %xmm2
363; SSSE3-NEXT:    psrad $31, %xmm2
364; SSSE3-NEXT:    psrad $24, %xmm1
365; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
366; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
367; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
368; SSSE3-NEXT:    movdqa %xmm2, %xmm4
369; SSSE3-NEXT:    psrad $31, %xmm4
370; SSSE3-NEXT:    psrad $24, %xmm2
371; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
372; SSSE3-NEXT:    psrld $16, %xmm3
373; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
374; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
375; SSSE3-NEXT:    movdqa %xmm3, %xmm4
376; SSSE3-NEXT:    psrad $31, %xmm4
377; SSSE3-NEXT:    psrad $24, %xmm3
378; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
379; SSSE3-NEXT:    retq
380;
381; SSE41-LABEL: sext_16i8_to_8i64:
382; SSE41:       # BB#0: # %entry
383; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
384; SSE41-NEXT:    movdqa %xmm0, %xmm1
385; SSE41-NEXT:    psrld $16, %xmm1
386; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
387; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
388; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
389; SSE41-NEXT:    psrlq $48, %xmm0
390; SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
391; SSE41-NEXT:    movdqa %xmm4, %xmm0
392; SSE41-NEXT:    retq
393;
394; AVX1-LABEL: sext_16i8_to_8i64:
395; AVX1:       # BB#0: # %entry
396; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
397; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
398; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
399; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
400; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
401; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
402; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
403; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
404; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
405; AVX1-NEXT:    vmovaps %ymm2, %ymm0
406; AVX1-NEXT:    retq
407;
408; AVX2-LABEL: sext_16i8_to_8i64:
409; AVX2:       # BB#0: # %entry
410; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
411; AVX2-NEXT:    vpslld $24, %xmm1, %xmm1
412; AVX2-NEXT:    vpsrad $24, %xmm1, %xmm1
413; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm2
414; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
415; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
416; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
417; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
418; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
419; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
420; AVX2-NEXT:    retq
421;
422; AVX512-LABEL: sext_16i8_to_8i64:
423; AVX512:       # BB#0: # %entry
424; AVX512-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
425; AVX512-NEXT:    vpsllq $56, %zmm0, %zmm0
426; AVX512-NEXT:    vpsraq $56, %zmm0, %zmm0
427; AVX512-NEXT:    retq
428;
429; X32-SSE41-LABEL: sext_16i8_to_8i64:
430; X32-SSE41:       # BB#0: # %entry
431; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
432; X32-SSE41-NEXT:    movdqa %xmm0, %xmm1
433; X32-SSE41-NEXT:    psrld $16, %xmm1
434; X32-SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
435; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
436; X32-SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
437; X32-SSE41-NEXT:    psrlq $48, %xmm0
438; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
439; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
440; X32-SSE41-NEXT:    retl
441entry:
; Keep only elements 0-7 of %A; the upper eight bytes are not used.
442  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
443  %C = sext <8 x i8> %B to <8 x i64>
444  ret <8 x i64> %C
445}
446
; Sign-extend the low four i16 lanes of a 128-bit vector to <4 x i32>.
; The checks expect pmovsxwd/vpmovsxwd with SSE4.1 or AVX. The SSE2 and SSSE3
; runs expect each word to be duplicated into a dword lane (punpcklwd) and the
; dword then arithmetically shifted right by 16.
447define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
448; SSE2-LABEL: sext_8i16_to_4i32:
449; SSE2:       # BB#0: # %entry
450; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
451; SSE2-NEXT:    psrad $16, %xmm0
452; SSE2-NEXT:    retq
453;
454; SSSE3-LABEL: sext_8i16_to_4i32:
455; SSSE3:       # BB#0: # %entry
456; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
457; SSSE3-NEXT:    psrad $16, %xmm0
458; SSSE3-NEXT:    retq
459;
460; SSE41-LABEL: sext_8i16_to_4i32:
461; SSE41:       # BB#0: # %entry
462; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
463; SSE41-NEXT:    retq
464;
465; AVX-LABEL: sext_8i16_to_4i32:
466; AVX:       # BB#0: # %entry
467; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
468; AVX-NEXT:    retq
469;
470; X32-SSE41-LABEL: sext_8i16_to_4i32:
471; X32-SSE41:       # BB#0: # %entry
472; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
473; X32-SSE41-NEXT:    retl
474entry:
; Keep only elements 0-3 of %A.
475  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
476  %C = sext <4 x i16> %B to <4 x i32>
477  ret <4 x i32> %C
478}
479
; Sign-extend all eight i16 lanes to <8 x i32> (a 256-bit result).
; The AVX2 and AVX512 runs expect one vpmovsxwd writing ymm0. The AVX1 run
; expects two 128-bit extends joined with vinsertf128. The SSE runs expect the
; result split across xmm0 (low half) and xmm1 (high half).
480define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
481; SSE2-LABEL: sext_8i16_to_8i32:
482; SSE2:       # BB#0: # %entry
483; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
484; SSE2-NEXT:    psrad $16, %xmm2
485; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
486; SSE2-NEXT:    psrad $16, %xmm1
487; SSE2-NEXT:    movdqa %xmm2, %xmm0
488; SSE2-NEXT:    retq
489;
490; SSSE3-LABEL: sext_8i16_to_8i32:
491; SSSE3:       # BB#0: # %entry
492; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
493; SSSE3-NEXT:    psrad $16, %xmm2
494; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
495; SSSE3-NEXT:    psrad $16, %xmm1
496; SSSE3-NEXT:    movdqa %xmm2, %xmm0
497; SSSE3-NEXT:    retq
498;
499; SSE41-LABEL: sext_8i16_to_8i32:
500; SSE41:       # BB#0: # %entry
501; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
502; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
503; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
504; SSE41-NEXT:    movdqa %xmm2, %xmm0
505; SSE41-NEXT:    retq
506;
507; AVX1-LABEL: sext_8i16_to_8i32:
508; AVX1:       # BB#0: # %entry
509; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
510; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
511; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
512; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
513; AVX1-NEXT:    retq
514;
515; AVX2-LABEL: sext_8i16_to_8i32:
516; AVX2:       # BB#0: # %entry
517; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
518; AVX2-NEXT:    retq
519;
520; AVX512-LABEL: sext_8i16_to_8i32:
521; AVX512:       # BB#0: # %entry
522; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
523; AVX512-NEXT:    retq
524;
525; X32-SSE41-LABEL: sext_8i16_to_8i32:
526; X32-SSE41:       # BB#0: # %entry
527; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
528; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
529; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
530; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
531; X32-SSE41-NEXT:    retl
532entry:
; Full-width extend: no shufflevector is needed because every lane is used.
533  %B = sext <8 x i16> %A to <8 x i32>
534  ret <8 x i32> %B
535}
536
; Sign-extend the low two i16 lanes of a 128-bit vector to <2 x i64>.
; The checks expect pmovsxwq/vpmovsxwq with SSE4.1 or AVX. The SSE2 and SSSE3
; runs expect the words to be widened into dword lanes, then one copy shifted
; by 31 (sign dwords) and one by 16 (value dwords), interleaved by punpckldq.
537define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
538; SSE2-LABEL: sext_8i16_to_2i64:
539; SSE2:       # BB#0: # %entry
540; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
541; SSE2-NEXT:    movdqa %xmm0, %xmm1
542; SSE2-NEXT:    psrad $31, %xmm1
543; SSE2-NEXT:    psrad $16, %xmm0
544; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
545; SSE2-NEXT:    retq
546;
547; SSSE3-LABEL: sext_8i16_to_2i64:
548; SSSE3:       # BB#0: # %entry
549; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
550; SSSE3-NEXT:    movdqa %xmm0, %xmm1
551; SSSE3-NEXT:    psrad $31, %xmm1
552; SSSE3-NEXT:    psrad $16, %xmm0
553; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
554; SSSE3-NEXT:    retq
555;
556; SSE41-LABEL: sext_8i16_to_2i64:
557; SSE41:       # BB#0: # %entry
558; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
559; SSE41-NEXT:    retq
560;
561; AVX-LABEL: sext_8i16_to_2i64:
562; AVX:       # BB#0: # %entry
563; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
564; AVX-NEXT:    retq
565;
566; X32-SSE41-LABEL: sext_8i16_to_2i64:
567; X32-SSE41:       # BB#0: # %entry
568; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
569; X32-SSE41-NEXT:    retl
570entry:
; Keep only elements 0-1 of %A.
571  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
572  %C = sext <2 x i16> %B to <2 x i64>
573  ret <2 x i64> %C
574}
575
; Sign-extend the low four i16 lanes of a 128-bit vector to <4 x i64>
; (a 256-bit result). The AVX2 and AVX512 runs expect one vpmovsxwq writing
; ymm0. The other runs expect two two-lane extends, with a pshufd bringing
; words 2-3 down to the bottom of the register for the second half.
576define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
577; SSE2-LABEL: sext_8i16_to_4i64:
578; SSE2:       # BB#0: # %entry
579; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
580; SSE2-NEXT:    movdqa %xmm2, %xmm1
581; SSE2-NEXT:    psrad $31, %xmm1
582; SSE2-NEXT:    psrad $16, %xmm2
583; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
584; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
585; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
586; SSE2-NEXT:    movdqa %xmm1, %xmm0
587; SSE2-NEXT:    psrad $31, %xmm0
588; SSE2-NEXT:    psrad $16, %xmm1
589; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
590; SSE2-NEXT:    movdqa %xmm2, %xmm0
591; SSE2-NEXT:    retq
592;
593; SSSE3-LABEL: sext_8i16_to_4i64:
594; SSSE3:       # BB#0: # %entry
595; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
596; SSSE3-NEXT:    movdqa %xmm2, %xmm1
597; SSSE3-NEXT:    psrad $31, %xmm1
598; SSSE3-NEXT:    psrad $16, %xmm2
599; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
600; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
601; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
602; SSSE3-NEXT:    movdqa %xmm1, %xmm0
603; SSSE3-NEXT:    psrad $31, %xmm0
604; SSSE3-NEXT:    psrad $16, %xmm1
605; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
606; SSSE3-NEXT:    movdqa %xmm2, %xmm0
607; SSSE3-NEXT:    retq
608;
609; SSE41-LABEL: sext_8i16_to_4i64:
610; SSE41:       # BB#0: # %entry
611; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
612; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
613; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
614; SSE41-NEXT:    movdqa %xmm2, %xmm0
615; SSE41-NEXT:    retq
616;
617; AVX1-LABEL: sext_8i16_to_4i64:
618; AVX1:       # BB#0: # %entry
619; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
620; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
621; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
622; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
623; AVX1-NEXT:    retq
624;
625; AVX2-LABEL: sext_8i16_to_4i64:
626; AVX2:       # BB#0: # %entry
627; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
628; AVX2-NEXT:    retq
629;
630; AVX512-LABEL: sext_8i16_to_4i64:
631; AVX512:       # BB#0: # %entry
632; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
633; AVX512-NEXT:    retq
634;
635; X32-SSE41-LABEL: sext_8i16_to_4i64:
636; X32-SSE41:       # BB#0: # %entry
637; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
638; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
639; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
640; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
641; X32-SSE41-NEXT:    retl
642entry:
; Keep only elements 0-3 of %A.
643  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
644  %C = sext <4 x i16> %B to <4 x i64>
645  ret <4 x i64> %C
646}
647
; Sign-extend the low two i32 lanes of a 128-bit vector to <2 x i64>.
; The checks expect pmovsxdq/vpmovsxdq with SSE4.1 or AVX. The SSE2 and SSSE3
; runs expect a copy of the input shifted arithmetically by 31 (giving the
; sign dwords) to be interleaved with the original dwords via punpckldq.
648define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
649; SSE2-LABEL: sext_4i32_to_2i64:
650; SSE2:       # BB#0: # %entry
651; SSE2-NEXT:    movdqa %xmm0, %xmm1
652; SSE2-NEXT:    psrad $31, %xmm1
653; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
654; SSE2-NEXT:    retq
655;
656; SSSE3-LABEL: sext_4i32_to_2i64:
657; SSSE3:       # BB#0: # %entry
658; SSSE3-NEXT:    movdqa %xmm0, %xmm1
659; SSSE3-NEXT:    psrad $31, %xmm1
660; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
661; SSSE3-NEXT:    retq
662;
663; SSE41-LABEL: sext_4i32_to_2i64:
664; SSE41:       # BB#0: # %entry
665; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
666; SSE41-NEXT:    retq
667;
668; AVX-LABEL: sext_4i32_to_2i64:
669; AVX:       # BB#0: # %entry
670; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
671; AVX-NEXT:    retq
672;
673; X32-SSE41-LABEL: sext_4i32_to_2i64:
674; X32-SSE41:       # BB#0: # %entry
675; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
676; X32-SSE41-NEXT:    retl
677entry:
; Keep only elements 0-1 of %A.
678  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
679  %C = sext <2 x i32> %B to <2 x i64>
680  ret <2 x i64> %C
681}
682
; Sign-extend all four i32 lanes to <4 x i64> (a 256-bit result).
; The AVX2 and AVX512 runs expect one vpmovsxdq writing ymm0. The AVX1 run
; expects two 128-bit extends joined with vinsertf128. The SSE runs expect the
; result split across xmm0 (low half) and xmm1 (high half).
683define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
684; SSE2-LABEL: sext_4i32_to_4i64:
685; SSE2:       # BB#0: # %entry
686; SSE2-NEXT:    movdqa %xmm0, %xmm2
687; SSE2-NEXT:    psrad $31, %xmm2
688; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
689; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
690; SSE2-NEXT:    movdqa %xmm1, %xmm2
691; SSE2-NEXT:    psrad $31, %xmm2
692; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
693; SSE2-NEXT:    retq
694;
695; SSSE3-LABEL: sext_4i32_to_4i64:
696; SSSE3:       # BB#0: # %entry
697; SSSE3-NEXT:    movdqa %xmm0, %xmm2
698; SSSE3-NEXT:    psrad $31, %xmm2
699; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
700; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
701; SSSE3-NEXT:    movdqa %xmm1, %xmm2
702; SSSE3-NEXT:    psrad $31, %xmm2
703; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
704; SSSE3-NEXT:    retq
705;
706; SSE41-LABEL: sext_4i32_to_4i64:
707; SSE41:       # BB#0: # %entry
708; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
709; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
710; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
711; SSE41-NEXT:    movdqa %xmm2, %xmm0
712; SSE41-NEXT:    retq
713;
714; AVX1-LABEL: sext_4i32_to_4i64:
715; AVX1:       # BB#0: # %entry
716; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
717; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
718; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
719; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
720; AVX1-NEXT:    retq
721;
722; AVX2-LABEL: sext_4i32_to_4i64:
723; AVX2:       # BB#0: # %entry
724; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
725; AVX2-NEXT:    retq
726;
727; AVX512-LABEL: sext_4i32_to_4i64:
728; AVX512:       # BB#0: # %entry
729; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
730; AVX512-NEXT:    retq
731;
732; X32-SSE41-LABEL: sext_4i32_to_4i64:
733; X32-SSE41:       # BB#0: # %entry
734; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
735; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
736; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
737; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
738; X32-SSE41-NEXT:    retl
739entry:
; Full-width extend: no shufflevector is needed because every lane is used.
740  %B = sext <4 x i32> %A to <4 x i64>
741  ret <4 x i64> %B
742}
743
; Loads a <2 x i1> vector through %ptr and sign-extends each 1-bit element to
; i64, returning the <2 x i64> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
744define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
745; SSE-LABEL: load_sext_2i1_to_2i64:
746; SSE:       # BB#0: # %entry
747; SSE-NEXT:    movzbl (%rdi), %eax
748; SSE-NEXT:    movq %rax, %rcx
749; SSE-NEXT:    shlq $62, %rcx
750; SSE-NEXT:    sarq $63, %rcx
751; SSE-NEXT:    movd %rcx, %xmm1
752; SSE-NEXT:    shlq $63, %rax
753; SSE-NEXT:    sarq $63, %rax
754; SSE-NEXT:    movd %rax, %xmm0
755; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
756; SSE-NEXT:    retq
757;
758; AVX1-LABEL: load_sext_2i1_to_2i64:
759; AVX1:       # BB#0: # %entry
760; AVX1-NEXT:    movzbl (%rdi), %eax
761; AVX1-NEXT:    movq %rax, %rcx
762; AVX1-NEXT:    shlq $62, %rcx
763; AVX1-NEXT:    sarq $63, %rcx
764; AVX1-NEXT:    vmovq %rcx, %xmm0
765; AVX1-NEXT:    shlq $63, %rax
766; AVX1-NEXT:    sarq $63, %rax
767; AVX1-NEXT:    vmovq %rax, %xmm1
768; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
769; AVX1-NEXT:    retq
770;
771; AVX2-LABEL: load_sext_2i1_to_2i64:
772; AVX2:       # BB#0: # %entry
773; AVX2-NEXT:    movzbl (%rdi), %eax
774; AVX2-NEXT:    movq %rax, %rcx
775; AVX2-NEXT:    shlq $62, %rcx
776; AVX2-NEXT:    sarq $63, %rcx
777; AVX2-NEXT:    vmovq %rcx, %xmm0
778; AVX2-NEXT:    shlq $63, %rax
779; AVX2-NEXT:    sarq $63, %rax
780; AVX2-NEXT:    vmovq %rax, %xmm1
781; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
782; AVX2-NEXT:    retq
783;
784; AVX512-LABEL: load_sext_2i1_to_2i64:
785; AVX512:       # BB#0: # %entry
786; AVX512-NEXT:    movzbl (%rdi), %eax
787; AVX512-NEXT:    kmovw %eax, %k1
788; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
789; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
790; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
791; AVX512-NEXT:    retq
792;
793; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
794; X32-SSE41:       # BB#0: # %entry
795; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
796; X32-SSE41-NEXT:    movzbl (%eax), %eax
797; X32-SSE41-NEXT:    movl %eax, %ecx
798; X32-SSE41-NEXT:    shll $31, %ecx
799; X32-SSE41-NEXT:    sarl $31, %ecx
800; X32-SSE41-NEXT:    movd %ecx, %xmm0
801; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
802; X32-SSE41-NEXT:    shll $30, %eax
803; X32-SSE41-NEXT:    sarl $31, %eax
804; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
805; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
806; X32-SSE41-NEXT:    retl
807entry:
808 %X = load <2 x i1>, <2 x i1>* %ptr
809 %Y = sext <2 x i1> %X to <2 x i64>
810 ret <2 x i64> %Y
811}
812
; Loads a <2 x i8> vector through %ptr and sign-extends each byte to i64,
; returning the <2 x i64> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
813define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
814; SSE2-LABEL: load_sext_2i8_to_2i64:
815; SSE2:       # BB#0: # %entry
816; SSE2-NEXT:    movzwl (%rdi), %eax
817; SSE2-NEXT:    movd %eax, %xmm0
818; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
819; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
820; SSE2-NEXT:    movdqa %xmm0, %xmm1
821; SSE2-NEXT:    psrad $31, %xmm1
822; SSE2-NEXT:    psrad $24, %xmm0
823; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
824; SSE2-NEXT:    retq
825;
826; SSSE3-LABEL: load_sext_2i8_to_2i64:
827; SSSE3:       # BB#0: # %entry
828; SSSE3-NEXT:    movzwl (%rdi), %eax
829; SSSE3-NEXT:    movd %eax, %xmm0
830; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
831; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
832; SSSE3-NEXT:    movdqa %xmm0, %xmm1
833; SSSE3-NEXT:    psrad $31, %xmm1
834; SSSE3-NEXT:    psrad $24, %xmm0
835; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
836; SSSE3-NEXT:    retq
837;
838; SSE41-LABEL: load_sext_2i8_to_2i64:
839; SSE41:       # BB#0: # %entry
840; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
841; SSE41-NEXT:    retq
842;
843; AVX-LABEL: load_sext_2i8_to_2i64:
844; AVX:       # BB#0: # %entry
845; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
846; AVX-NEXT:    retq
847;
848; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
849; X32-SSE41:       # BB#0: # %entry
850; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
851; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
852; X32-SSE41-NEXT:    retl
853entry:
854 %X = load <2 x i8>, <2 x i8>* %ptr
855 %Y = sext <2 x i8> %X to <2 x i64>
856 ret <2 x i64> %Y
857}
858
; Loads a <4 x i1> vector through %ptr and sign-extends each 1-bit element to
; i32, returning the <4 x i32> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
859define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
860; SSE2-LABEL: load_sext_4i1_to_4i32:
861; SSE2:       # BB#0: # %entry
862; SSE2-NEXT:    movzbl (%rdi), %eax
863; SSE2-NEXT:    movq %rax, %rcx
864; SSE2-NEXT:    shlq $60, %rcx
865; SSE2-NEXT:    sarq $63, %rcx
866; SSE2-NEXT:    movd %ecx, %xmm0
867; SSE2-NEXT:    movq %rax, %rcx
868; SSE2-NEXT:    shlq $62, %rcx
869; SSE2-NEXT:    sarq $63, %rcx
870; SSE2-NEXT:    movd %ecx, %xmm1
871; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
872; SSE2-NEXT:    movq %rax, %rcx
873; SSE2-NEXT:    shlq $61, %rcx
874; SSE2-NEXT:    sarq $63, %rcx
875; SSE2-NEXT:    movd %ecx, %xmm2
876; SSE2-NEXT:    shlq $63, %rax
877; SSE2-NEXT:    sarq $63, %rax
878; SSE2-NEXT:    movd %eax, %xmm0
879; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
880; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
881; SSE2-NEXT:    retq
882;
883; SSSE3-LABEL: load_sext_4i1_to_4i32:
884; SSSE3:       # BB#0: # %entry
885; SSSE3-NEXT:    movzbl (%rdi), %eax
886; SSSE3-NEXT:    movq %rax, %rcx
887; SSSE3-NEXT:    shlq $60, %rcx
888; SSSE3-NEXT:    sarq $63, %rcx
889; SSSE3-NEXT:    movd %ecx, %xmm0
890; SSSE3-NEXT:    movq %rax, %rcx
891; SSSE3-NEXT:    shlq $62, %rcx
892; SSSE3-NEXT:    sarq $63, %rcx
893; SSSE3-NEXT:    movd %ecx, %xmm1
894; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
895; SSSE3-NEXT:    movq %rax, %rcx
896; SSSE3-NEXT:    shlq $61, %rcx
897; SSSE3-NEXT:    sarq $63, %rcx
898; SSSE3-NEXT:    movd %ecx, %xmm2
899; SSSE3-NEXT:    shlq $63, %rax
900; SSSE3-NEXT:    sarq $63, %rax
901; SSSE3-NEXT:    movd %eax, %xmm0
902; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
903; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
904; SSSE3-NEXT:    retq
905;
906; SSE41-LABEL: load_sext_4i1_to_4i32:
907; SSE41:       # BB#0: # %entry
908; SSE41-NEXT:    movzbl (%rdi), %eax
909; SSE41-NEXT:    movq %rax, %rcx
910; SSE41-NEXT:    shlq $62, %rcx
911; SSE41-NEXT:    sarq $63, %rcx
912; SSE41-NEXT:    movq %rax, %rdx
913; SSE41-NEXT:    shlq $63, %rdx
914; SSE41-NEXT:    sarq $63, %rdx
915; SSE41-NEXT:    movd %edx, %xmm0
916; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
917; SSE41-NEXT:    movq %rax, %rcx
918; SSE41-NEXT:    shlq $61, %rcx
919; SSE41-NEXT:    sarq $63, %rcx
920; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
921; SSE41-NEXT:    shlq $60, %rax
922; SSE41-NEXT:    sarq $63, %rax
923; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
924; SSE41-NEXT:    retq
925;
926; AVX1-LABEL: load_sext_4i1_to_4i32:
927; AVX1:       # BB#0: # %entry
928; AVX1-NEXT:    movzbl (%rdi), %eax
929; AVX1-NEXT:    movq %rax, %rcx
930; AVX1-NEXT:    shlq $62, %rcx
931; AVX1-NEXT:    sarq $63, %rcx
932; AVX1-NEXT:    movq %rax, %rdx
933; AVX1-NEXT:    shlq $63, %rdx
934; AVX1-NEXT:    sarq $63, %rdx
935; AVX1-NEXT:    vmovd %edx, %xmm0
936; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
937; AVX1-NEXT:    movq %rax, %rcx
938; AVX1-NEXT:    shlq $61, %rcx
939; AVX1-NEXT:    sarq $63, %rcx
940; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
941; AVX1-NEXT:    shlq $60, %rax
942; AVX1-NEXT:    sarq $63, %rax
943; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
944; AVX1-NEXT:    retq
945;
946; AVX2-LABEL: load_sext_4i1_to_4i32:
947; AVX2:       # BB#0: # %entry
948; AVX2-NEXT:    movzbl (%rdi), %eax
949; AVX2-NEXT:    movq %rax, %rcx
950; AVX2-NEXT:    shlq $62, %rcx
951; AVX2-NEXT:    sarq $63, %rcx
952; AVX2-NEXT:    movq %rax, %rdx
953; AVX2-NEXT:    shlq $63, %rdx
954; AVX2-NEXT:    sarq $63, %rdx
955; AVX2-NEXT:    vmovd %edx, %xmm0
956; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
957; AVX2-NEXT:    movq %rax, %rcx
958; AVX2-NEXT:    shlq $61, %rcx
959; AVX2-NEXT:    sarq $63, %rcx
960; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
961; AVX2-NEXT:    shlq $60, %rax
962; AVX2-NEXT:    sarq $63, %rax
963; AVX2-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
964; AVX2-NEXT:    retq
965;
966; AVX512-LABEL: load_sext_4i1_to_4i32:
967; AVX512:       # BB#0: # %entry
968; AVX512-NEXT:    movzbl (%rdi), %eax
969; AVX512-NEXT:    kmovw %eax, %k1
970; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
971; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
972; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
973; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
974; AVX512-NEXT:    retq
975;
976; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
977; X32-SSE41:       # BB#0: # %entry
978; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
979; X32-SSE41-NEXT:    movl (%eax), %eax
980; X32-SSE41-NEXT:    movl %eax, %ecx
981; X32-SSE41-NEXT:    shll $30, %ecx
982; X32-SSE41-NEXT:    sarl $31, %ecx
983; X32-SSE41-NEXT:    movl %eax, %edx
984; X32-SSE41-NEXT:    shll $31, %edx
985; X32-SSE41-NEXT:    sarl $31, %edx
986; X32-SSE41-NEXT:    movd %edx, %xmm0
987; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
988; X32-SSE41-NEXT:    movl %eax, %ecx
989; X32-SSE41-NEXT:    shll $29, %ecx
990; X32-SSE41-NEXT:    sarl $31, %ecx
991; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
992; X32-SSE41-NEXT:    shll $28, %eax
993; X32-SSE41-NEXT:    sarl $31, %eax
994; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
995; X32-SSE41-NEXT:    retl
996entry:
997 %X = load <4 x i1>, <4 x i1>* %ptr
998 %Y = sext <4 x i1> %X to <4 x i32>
999 ret <4 x i32> %Y
1000}
1001
; Loads a <4 x i8> vector through %ptr and sign-extends each byte to i32,
; returning the <4 x i32> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1002define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
1003; SSE2-LABEL: load_sext_4i8_to_4i32:
1004; SSE2:       # BB#0: # %entry
1005; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1006; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1007; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1008; SSE2-NEXT:    psrad $24, %xmm0
1009; SSE2-NEXT:    retq
1010;
1011; SSSE3-LABEL: load_sext_4i8_to_4i32:
1012; SSSE3:       # BB#0: # %entry
1013; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1014; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1015; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1016; SSSE3-NEXT:    psrad $24, %xmm0
1017; SSSE3-NEXT:    retq
1018;
1019; SSE41-LABEL: load_sext_4i8_to_4i32:
1020; SSE41:       # BB#0: # %entry
1021; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1022; SSE41-NEXT:    retq
1023;
1024; AVX-LABEL: load_sext_4i8_to_4i32:
1025; AVX:       # BB#0: # %entry
1026; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
1027; AVX-NEXT:    retq
1028;
1029; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
1030; X32-SSE41:       # BB#0: # %entry
1031; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1032; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1033; X32-SSE41-NEXT:    retl
1034entry:
1035 %X = load <4 x i8>, <4 x i8>* %ptr
1036 %Y = sext <4 x i8> %X to <4 x i32>
1037 ret <4 x i32> %Y
1038}
1039
; Loads a <4 x i1> vector through %ptr and sign-extends each 1-bit element to
; i64, returning the <4 x i64> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1040define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
1041; SSE2-LABEL: load_sext_4i1_to_4i64:
1042; SSE2:       # BB#0: # %entry
1043; SSE2-NEXT:    movl (%rdi), %eax
1044; SSE2-NEXT:    movl %eax, %ecx
1045; SSE2-NEXT:    shrl $3, %ecx
1046; SSE2-NEXT:    movd %ecx, %xmm0
1047; SSE2-NEXT:    movl %eax, %ecx
1048; SSE2-NEXT:    shrl %ecx
1049; SSE2-NEXT:    movd %ecx, %xmm1
1050; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1051; SSE2-NEXT:    movd %eax, %xmm2
1052; SSE2-NEXT:    shrl $2, %eax
1053; SSE2-NEXT:    movd %eax, %xmm0
1054; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1055; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1056; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1057; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
1058; SSE2-NEXT:    psllq $63, %xmm0
1059; SSE2-NEXT:    psrad $31, %xmm0
1060; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1061; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
1062; SSE2-NEXT:    psllq $63, %xmm1
1063; SSE2-NEXT:    psrad $31, %xmm1
1064; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1065; SSE2-NEXT:    retq
1066;
1067; SSSE3-LABEL: load_sext_4i1_to_4i64:
1068; SSSE3:       # BB#0: # %entry
1069; SSSE3-NEXT:    movl (%rdi), %eax
1070; SSSE3-NEXT:    movl %eax, %ecx
1071; SSSE3-NEXT:    shrl $3, %ecx
1072; SSSE3-NEXT:    movd %ecx, %xmm0
1073; SSSE3-NEXT:    movl %eax, %ecx
1074; SSSE3-NEXT:    shrl %ecx
1075; SSSE3-NEXT:    movd %ecx, %xmm1
1076; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1077; SSSE3-NEXT:    movd %eax, %xmm2
1078; SSSE3-NEXT:    shrl $2, %eax
1079; SSSE3-NEXT:    movd %eax, %xmm0
1080; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1081; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1082; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm2
1083; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
1084; SSSE3-NEXT:    psllq $63, %xmm0
1085; SSSE3-NEXT:    psrad $31, %xmm0
1086; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1087; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
1088; SSSE3-NEXT:    psllq $63, %xmm1
1089; SSSE3-NEXT:    psrad $31, %xmm1
1090; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1091; SSSE3-NEXT:    retq
1092;
1093; SSE41-LABEL: load_sext_4i1_to_4i64:
1094; SSE41:       # BB#0: # %entry
1095; SSE41-NEXT:    movl (%rdi), %eax
1096; SSE41-NEXT:    movl %eax, %ecx
1097; SSE41-NEXT:    shrl %ecx
1098; SSE41-NEXT:    movd %eax, %xmm1
1099; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
1100; SSE41-NEXT:    movl %eax, %ecx
1101; SSE41-NEXT:    shrl $2, %ecx
1102; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
1103; SSE41-NEXT:    shrl $3, %eax
1104; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
1105; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
1106; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1107; SSE41-NEXT:    psllq $63, %xmm0
1108; SSE41-NEXT:    psrad $31, %xmm0
1109; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1110; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1111; SSE41-NEXT:    psllq $63, %xmm1
1112; SSE41-NEXT:    psrad $31, %xmm1
1113; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1114; SSE41-NEXT:    retq
1115;
1116; AVX1-LABEL: load_sext_4i1_to_4i64:
1117; AVX1:       # BB#0: # %entry
1118; AVX1-NEXT:    movzbl (%rdi), %eax
1119; AVX1-NEXT:    movq %rax, %rcx
1120; AVX1-NEXT:    shlq $62, %rcx
1121; AVX1-NEXT:    sarq $63, %rcx
1122; AVX1-NEXT:    movq %rax, %rdx
1123; AVX1-NEXT:    shlq $63, %rdx
1124; AVX1-NEXT:    sarq $63, %rdx
1125; AVX1-NEXT:    vmovd %edx, %xmm0
1126; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1127; AVX1-NEXT:    movq %rax, %rcx
1128; AVX1-NEXT:    shlq $61, %rcx
1129; AVX1-NEXT:    sarq $63, %rcx
1130; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1131; AVX1-NEXT:    shlq $60, %rax
1132; AVX1-NEXT:    sarq $63, %rax
1133; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1134; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1135; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1136; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1137; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1138; AVX1-NEXT:    retq
1139;
1140; AVX2-LABEL: load_sext_4i1_to_4i64:
1141; AVX2:       # BB#0: # %entry
1142; AVX2-NEXT:    movzbl (%rdi), %eax
1143; AVX2-NEXT:    movq %rax, %rcx
1144; AVX2-NEXT:    shlq $60, %rcx
1145; AVX2-NEXT:    sarq $63, %rcx
1146; AVX2-NEXT:    vmovq %rcx, %xmm0
1147; AVX2-NEXT:    movq %rax, %rcx
1148; AVX2-NEXT:    shlq $61, %rcx
1149; AVX2-NEXT:    sarq $63, %rcx
1150; AVX2-NEXT:    vmovq %rcx, %xmm1
1151; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1152; AVX2-NEXT:    movq %rax, %rcx
1153; AVX2-NEXT:    shlq $62, %rcx
1154; AVX2-NEXT:    sarq $63, %rcx
1155; AVX2-NEXT:    vmovq %rcx, %xmm1
1156; AVX2-NEXT:    shlq $63, %rax
1157; AVX2-NEXT:    sarq $63, %rax
1158; AVX2-NEXT:    vmovq %rax, %xmm2
1159; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1160; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1161; AVX2-NEXT:    retq
1162;
1163; AVX512-LABEL: load_sext_4i1_to_4i64:
1164; AVX512:       # BB#0: # %entry
1165; AVX512-NEXT:    movzbl (%rdi), %eax
1166; AVX512-NEXT:    kmovw %eax, %k1
1167; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
1168; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
1169; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
1170; AVX512-NEXT:    retq
1171;
1172; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
1173; X32-SSE41:       # BB#0: # %entry
1174; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1175; X32-SSE41-NEXT:    movzbl (%eax), %eax
1176; X32-SSE41-NEXT:    movl %eax, %ecx
1177; X32-SSE41-NEXT:    shrl %ecx
1178; X32-SSE41-NEXT:    movd %eax, %xmm1
1179; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
1180; X32-SSE41-NEXT:    movl %eax, %ecx
1181; X32-SSE41-NEXT:    shrl $2, %ecx
1182; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
1183; X32-SSE41-NEXT:    shrl $3, %eax
1184; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
1185; X32-SSE41-NEXT:    pand {{\.LCPI.*}}, %xmm1
1186; X32-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1187; X32-SSE41-NEXT:    psllq $63, %xmm0
1188; X32-SSE41-NEXT:    psrad $31, %xmm0
1189; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1190; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1191; X32-SSE41-NEXT:    psllq $63, %xmm1
1192; X32-SSE41-NEXT:    psrad $31, %xmm1
1193; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1194; X32-SSE41-NEXT:    retl
1195entry:
1196 %X = load <4 x i1>, <4 x i1>* %ptr
1197 %Y = sext <4 x i1> %X to <4 x i64>
1198 ret <4 x i64> %Y
1199}
1200
; Loads a <4 x i8> vector through %ptr and sign-extends each byte to i64,
; returning the <4 x i64> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1201define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
1202; SSE2-LABEL: load_sext_4i8_to_4i64:
1203; SSE2:       # BB#0: # %entry
1204; SSE2-NEXT:    movsbq 1(%rdi), %rax
1205; SSE2-NEXT:    movd %rax, %xmm1
1206; SSE2-NEXT:    movsbq (%rdi), %rax
1207; SSE2-NEXT:    movd %rax, %xmm0
1208; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1209; SSE2-NEXT:    movsbq 3(%rdi), %rax
1210; SSE2-NEXT:    movd %rax, %xmm2
1211; SSE2-NEXT:    movsbq 2(%rdi), %rax
1212; SSE2-NEXT:    movd %rax, %xmm1
1213; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1214; SSE2-NEXT:    retq
1215;
1216; SSSE3-LABEL: load_sext_4i8_to_4i64:
1217; SSSE3:       # BB#0: # %entry
1218; SSSE3-NEXT:    movsbq 1(%rdi), %rax
1219; SSSE3-NEXT:    movd %rax, %xmm1
1220; SSSE3-NEXT:    movsbq (%rdi), %rax
1221; SSSE3-NEXT:    movd %rax, %xmm0
1222; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1223; SSSE3-NEXT:    movsbq 3(%rdi), %rax
1224; SSSE3-NEXT:    movd %rax, %xmm2
1225; SSSE3-NEXT:    movsbq 2(%rdi), %rax
1226; SSSE3-NEXT:    movd %rax, %xmm1
1227; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1228; SSSE3-NEXT:    retq
1229;
1230; SSE41-LABEL: load_sext_4i8_to_4i64:
1231; SSE41:       # BB#0: # %entry
1232; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1233; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
1234; SSE41-NEXT:    retq
1235;
1236; AVX1-LABEL: load_sext_4i8_to_4i64:
1237; AVX1:       # BB#0: # %entry
1238; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
1239; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1240; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1241; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1242; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1243; AVX1-NEXT:    retq
1244;
1245; AVX2-LABEL: load_sext_4i8_to_4i64:
1246; AVX2:       # BB#0: # %entry
1247; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
1248; AVX2-NEXT:    retq
1249;
1250; AVX512-LABEL: load_sext_4i8_to_4i64:
1251; AVX512:       # BB#0: # %entry
1252; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
1253; AVX512-NEXT:    retq
1254;
1255; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
1256; X32-SSE41:       # BB#0: # %entry
1257; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1258; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1259; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
1260; X32-SSE41-NEXT:    retl
1261entry:
1262 %X = load <4 x i8>, <4 x i8>* %ptr
1263 %Y = sext <4 x i8> %X to <4 x i64>
1264 ret <4 x i64> %Y
1265}
1266
; Loads an <8 x i1> vector through %ptr and sign-extends each 1-bit element to
; i16, returning the <8 x i16> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1267define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
1268; SSE2-LABEL: load_sext_8i1_to_8i16:
1269; SSE2:       # BB#0: # %entry
1270; SSE2-NEXT:    movsbq (%rdi), %rax
1271; SSE2-NEXT:    movq %rax, %rcx
1272; SSE2-NEXT:    shrq $7, %rcx
1273; SSE2-NEXT:    movd %ecx, %xmm0
1274; SSE2-NEXT:    movq %rax, %rcx
1275; SSE2-NEXT:    shlq $60, %rcx
1276; SSE2-NEXT:    sarq $63, %rcx
1277; SSE2-NEXT:    movd %ecx, %xmm2
1278; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1279; SSE2-NEXT:    movq %rax, %rcx
1280; SSE2-NEXT:    shlq $58, %rcx
1281; SSE2-NEXT:    sarq $63, %rcx
1282; SSE2-NEXT:    movd %ecx, %xmm0
1283; SSE2-NEXT:    movq %rax, %rcx
1284; SSE2-NEXT:    shlq $62, %rcx
1285; SSE2-NEXT:    sarq $63, %rcx
1286; SSE2-NEXT:    movd %ecx, %xmm1
1287; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1288; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1289; SSE2-NEXT:    movq %rax, %rcx
1290; SSE2-NEXT:    shlq $57, %rcx
1291; SSE2-NEXT:    sarq $63, %rcx
1292; SSE2-NEXT:    movd %ecx, %xmm0
1293; SSE2-NEXT:    movq %rax, %rcx
1294; SSE2-NEXT:    shlq $61, %rcx
1295; SSE2-NEXT:    sarq $63, %rcx
1296; SSE2-NEXT:    movd %ecx, %xmm2
1297; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1298; SSE2-NEXT:    movq %rax, %rcx
1299; SSE2-NEXT:    shlq $59, %rcx
1300; SSE2-NEXT:    sarq $63, %rcx
1301; SSE2-NEXT:    movd %ecx, %xmm3
1302; SSE2-NEXT:    shlq $63, %rax
1303; SSE2-NEXT:    sarq $63, %rax
1304; SSE2-NEXT:    movd %eax, %xmm0
1305; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1306; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1307; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1308; SSE2-NEXT:    retq
1309;
1310; SSSE3-LABEL: load_sext_8i1_to_8i16:
1311; SSSE3:       # BB#0: # %entry
1312; SSSE3-NEXT:    movsbq (%rdi), %rax
1313; SSSE3-NEXT:    movq %rax, %rcx
1314; SSSE3-NEXT:    shrq $7, %rcx
1315; SSSE3-NEXT:    movd %ecx, %xmm0
1316; SSSE3-NEXT:    movq %rax, %rcx
1317; SSSE3-NEXT:    shlq $60, %rcx
1318; SSSE3-NEXT:    sarq $63, %rcx
1319; SSSE3-NEXT:    movd %ecx, %xmm2
1320; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1321; SSSE3-NEXT:    movq %rax, %rcx
1322; SSSE3-NEXT:    shlq $58, %rcx
1323; SSSE3-NEXT:    sarq $63, %rcx
1324; SSSE3-NEXT:    movd %ecx, %xmm0
1325; SSSE3-NEXT:    movq %rax, %rcx
1326; SSSE3-NEXT:    shlq $62, %rcx
1327; SSSE3-NEXT:    sarq $63, %rcx
1328; SSSE3-NEXT:    movd %ecx, %xmm1
1329; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1330; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1331; SSSE3-NEXT:    movq %rax, %rcx
1332; SSSE3-NEXT:    shlq $57, %rcx
1333; SSSE3-NEXT:    sarq $63, %rcx
1334; SSSE3-NEXT:    movd %ecx, %xmm0
1335; SSSE3-NEXT:    movq %rax, %rcx
1336; SSSE3-NEXT:    shlq $61, %rcx
1337; SSSE3-NEXT:    sarq $63, %rcx
1338; SSSE3-NEXT:    movd %ecx, %xmm2
1339; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1340; SSSE3-NEXT:    movq %rax, %rcx
1341; SSSE3-NEXT:    shlq $59, %rcx
1342; SSSE3-NEXT:    sarq $63, %rcx
1343; SSSE3-NEXT:    movd %ecx, %xmm3
1344; SSSE3-NEXT:    shlq $63, %rax
1345; SSSE3-NEXT:    sarq $63, %rax
1346; SSSE3-NEXT:    movd %eax, %xmm0
1347; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1348; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1349; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1350; SSSE3-NEXT:    retq
1351;
1352; SSE41-LABEL: load_sext_8i1_to_8i16:
1353; SSE41:       # BB#0: # %entry
1354; SSE41-NEXT:    movsbq (%rdi), %rax
1355; SSE41-NEXT:    movq %rax, %rcx
1356; SSE41-NEXT:    shlq $62, %rcx
1357; SSE41-NEXT:    sarq $63, %rcx
1358; SSE41-NEXT:    movq %rax, %rdx
1359; SSE41-NEXT:    shlq $63, %rdx
1360; SSE41-NEXT:    sarq $63, %rdx
1361; SSE41-NEXT:    movd %edx, %xmm0
1362; SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
1363; SSE41-NEXT:    movq %rax, %rcx
1364; SSE41-NEXT:    shlq $61, %rcx
1365; SSE41-NEXT:    sarq $63, %rcx
1366; SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
1367; SSE41-NEXT:    movq %rax, %rcx
1368; SSE41-NEXT:    shlq $60, %rcx
1369; SSE41-NEXT:    sarq $63, %rcx
1370; SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
1371; SSE41-NEXT:    movq %rax, %rcx
1372; SSE41-NEXT:    shlq $59, %rcx
1373; SSE41-NEXT:    sarq $63, %rcx
1374; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
1375; SSE41-NEXT:    movq %rax, %rcx
1376; SSE41-NEXT:    shlq $58, %rcx
1377; SSE41-NEXT:    sarq $63, %rcx
1378; SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
1379; SSE41-NEXT:    movq %rax, %rcx
1380; SSE41-NEXT:    shlq $57, %rcx
1381; SSE41-NEXT:    sarq $63, %rcx
1382; SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
1383; SSE41-NEXT:    shrq $7, %rax
1384; SSE41-NEXT:    pinsrw $7, %eax, %xmm0
1385; SSE41-NEXT:    retq
1386;
1387; AVX1-LABEL: load_sext_8i1_to_8i16:
1388; AVX1:       # BB#0: # %entry
1389; AVX1-NEXT:    movsbq (%rdi), %rax
1390; AVX1-NEXT:    movq %rax, %rcx
1391; AVX1-NEXT:    shlq $62, %rcx
1392; AVX1-NEXT:    sarq $63, %rcx
1393; AVX1-NEXT:    movq %rax, %rdx
1394; AVX1-NEXT:    shlq $63, %rdx
1395; AVX1-NEXT:    sarq $63, %rdx
1396; AVX1-NEXT:    vmovd %edx, %xmm0
1397; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
1398; AVX1-NEXT:    movq %rax, %rcx
1399; AVX1-NEXT:    shlq $61, %rcx
1400; AVX1-NEXT:    sarq $63, %rcx
1401; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
1402; AVX1-NEXT:    movq %rax, %rcx
1403; AVX1-NEXT:    shlq $60, %rcx
1404; AVX1-NEXT:    sarq $63, %rcx
1405; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
1406; AVX1-NEXT:    movq %rax, %rcx
1407; AVX1-NEXT:    shlq $59, %rcx
1408; AVX1-NEXT:    sarq $63, %rcx
1409; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1410; AVX1-NEXT:    movq %rax, %rcx
1411; AVX1-NEXT:    shlq $58, %rcx
1412; AVX1-NEXT:    sarq $63, %rcx
1413; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
1414; AVX1-NEXT:    movq %rax, %rcx
1415; AVX1-NEXT:    shlq $57, %rcx
1416; AVX1-NEXT:    sarq $63, %rcx
1417; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
1418; AVX1-NEXT:    shrq $7, %rax
1419; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1420; AVX1-NEXT:    retq
1421;
1422; AVX2-LABEL: load_sext_8i1_to_8i16:
1423; AVX2:       # BB#0: # %entry
1424; AVX2-NEXT:    movsbq (%rdi), %rax
1425; AVX2-NEXT:    movq %rax, %rcx
1426; AVX2-NEXT:    shlq $62, %rcx
1427; AVX2-NEXT:    sarq $63, %rcx
1428; AVX2-NEXT:    movq %rax, %rdx
1429; AVX2-NEXT:    shlq $63, %rdx
1430; AVX2-NEXT:    sarq $63, %rdx
1431; AVX2-NEXT:    vmovd %edx, %xmm0
1432; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
1433; AVX2-NEXT:    movq %rax, %rcx
1434; AVX2-NEXT:    shlq $61, %rcx
1435; AVX2-NEXT:    sarq $63, %rcx
1436; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
1437; AVX2-NEXT:    movq %rax, %rcx
1438; AVX2-NEXT:    shlq $60, %rcx
1439; AVX2-NEXT:    sarq $63, %rcx
1440; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
1441; AVX2-NEXT:    movq %rax, %rcx
1442; AVX2-NEXT:    shlq $59, %rcx
1443; AVX2-NEXT:    sarq $63, %rcx
1444; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1445; AVX2-NEXT:    movq %rax, %rcx
1446; AVX2-NEXT:    shlq $58, %rcx
1447; AVX2-NEXT:    sarq $63, %rcx
1448; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
1449; AVX2-NEXT:    movq %rax, %rcx
1450; AVX2-NEXT:    shlq $57, %rcx
1451; AVX2-NEXT:    sarq $63, %rcx
1452; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
1453; AVX2-NEXT:    shrq $7, %rax
1454; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1455; AVX2-NEXT:    retq
1456;
1457; AVX512-LABEL: load_sext_8i1_to_8i16:
1458; AVX512:       # BB#0: # %entry
1459; AVX512-NEXT:    movzbl (%rdi), %eax
1460; AVX512-NEXT:    kmovw %eax, %k1
1461; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
1462; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
1463; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
1464; AVX512-NEXT:    retq
1465;
1466; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
1467; X32-SSE41:       # BB#0: # %entry
1468; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1469; X32-SSE41-NEXT:    movsbl (%eax), %eax
1470; X32-SSE41-NEXT:    movl %eax, %ecx
1471; X32-SSE41-NEXT:    shll $30, %ecx
1472; X32-SSE41-NEXT:    sarl $31, %ecx
1473; X32-SSE41-NEXT:    movl %eax, %edx
1474; X32-SSE41-NEXT:    shll $31, %edx
1475; X32-SSE41-NEXT:    sarl $31, %edx
1476; X32-SSE41-NEXT:    movd %edx, %xmm0
1477; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
1478; X32-SSE41-NEXT:    movl %eax, %ecx
1479; X32-SSE41-NEXT:    shll $29, %ecx
1480; X32-SSE41-NEXT:    sarl $31, %ecx
1481; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
1482; X32-SSE41-NEXT:    movl %eax, %ecx
1483; X32-SSE41-NEXT:    shll $28, %ecx
1484; X32-SSE41-NEXT:    sarl $31, %ecx
1485; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
1486; X32-SSE41-NEXT:    movl %eax, %ecx
1487; X32-SSE41-NEXT:    shll $27, %ecx
1488; X32-SSE41-NEXT:    sarl $31, %ecx
1489; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
1490; X32-SSE41-NEXT:    movl %eax, %ecx
1491; X32-SSE41-NEXT:    shll $26, %ecx
1492; X32-SSE41-NEXT:    sarl $31, %ecx
1493; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
1494; X32-SSE41-NEXT:    movl %eax, %ecx
1495; X32-SSE41-NEXT:    shll $25, %ecx
1496; X32-SSE41-NEXT:    sarl $31, %ecx
1497; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
1498; X32-SSE41-NEXT:    shrl $7, %eax
1499; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm0
1500; X32-SSE41-NEXT:    retl
1501entry:
1502 %X = load <8 x i1>, <8 x i1>* %ptr
1503 %Y = sext <8 x i1> %X to <8 x i16>
1504 ret <8 x i16> %Y
1505}
1506
; Loads an <8 x i8> vector through %ptr and sign-extends each byte to i16,
; returning the <8 x i16> result.
; The check lines inside this function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1507define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
1508; SSE2-LABEL: load_sext_8i8_to_8i16:
1509; SSE2:       # BB#0: # %entry
1510; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1511; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1512; SSE2-NEXT:    psraw $8, %xmm0
1513; SSE2-NEXT:    retq
1514;
1515; SSSE3-LABEL: load_sext_8i8_to_8i16:
1516; SSSE3:       # BB#0: # %entry
1517; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1518; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1519; SSSE3-NEXT:    psraw $8, %xmm0
1520; SSSE3-NEXT:    retq
1521;
1522; SSE41-LABEL: load_sext_8i8_to_8i16:
1523; SSE41:       # BB#0: # %entry
1524; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
1525; SSE41-NEXT:    retq
1526;
1527; AVX-LABEL: load_sext_8i8_to_8i16:
1528; AVX:       # BB#0: # %entry
1529; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
1530; AVX-NEXT:    retq
1531;
1532; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
1533; X32-SSE41:       # BB#0: # %entry
1534; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1535; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
1536; X32-SSE41-NEXT:    retl
1537entry:
1538 %X = load <8 x i8>, <8 x i8>* %ptr
1539 %Y = sext <8 x i8> %X to <8 x i16>
1540 ret <8 x i16> %Y
1541}
1542
; Test case: load an <8 x i8> vector through %ptr and sign-extend every lane
; to i64, returning <8 x i64>.
;
; Expected lowering pinned by the assertions below, per RUN configuration:
;   SSE2 / SSSE3  - one scalar movsbq per source byte (offsets 0..7), each moved
;                   into an xmm register and paired up with punpcklqdq, filling
;                   xmm0-xmm3.
;   SSE4.1        - four pmovsxbq loads at byte offsets 0, 2, 4 and 6; the
;                   32-bit run is identical after fetching %ptr from the stack.
;   AVX1          - two vpmovsxbd loads (offsets 0 and 4), each widened again
;                   with vpmovsxdq and recombined with vinsertf128.
;   AVX2          - two ymm vpmovsxbq loads (offsets 0 and 4).
;   AVX512        - a single zmm vpmovsxbq load.
;
; The assertion lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with the update script instead of editing them by hand.
1543define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
1544; SSE2-LABEL: load_sext_8i8_to_8i64:
1545; SSE2:       # BB#0: # %entry
1546; SSE2-NEXT:    movsbq 1(%rdi), %rax
1547; SSE2-NEXT:    movd %rax, %xmm1
1548; SSE2-NEXT:    movsbq (%rdi), %rax
1549; SSE2-NEXT:    movd %rax, %xmm0
1550; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1551; SSE2-NEXT:    movsbq 3(%rdi), %rax
1552; SSE2-NEXT:    movd %rax, %xmm2
1553; SSE2-NEXT:    movsbq 2(%rdi), %rax
1554; SSE2-NEXT:    movd %rax, %xmm1
1555; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1556; SSE2-NEXT:    movsbq 5(%rdi), %rax
1557; SSE2-NEXT:    movd %rax, %xmm3
1558; SSE2-NEXT:    movsbq 4(%rdi), %rax
1559; SSE2-NEXT:    movd %rax, %xmm2
1560; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1561; SSE2-NEXT:    movsbq 7(%rdi), %rax
1562; SSE2-NEXT:    movd %rax, %xmm4
1563; SSE2-NEXT:    movsbq 6(%rdi), %rax
1564; SSE2-NEXT:    movd %rax, %xmm3
1565; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1566; SSE2-NEXT:    retq
1567;
1568; SSSE3-LABEL: load_sext_8i8_to_8i64:
1569; SSSE3:       # BB#0: # %entry
1570; SSSE3-NEXT:    movsbq 1(%rdi), %rax
1571; SSSE3-NEXT:    movd %rax, %xmm1
1572; SSSE3-NEXT:    movsbq (%rdi), %rax
1573; SSSE3-NEXT:    movd %rax, %xmm0
1574; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1575; SSSE3-NEXT:    movsbq 3(%rdi), %rax
1576; SSSE3-NEXT:    movd %rax, %xmm2
1577; SSSE3-NEXT:    movsbq 2(%rdi), %rax
1578; SSSE3-NEXT:    movd %rax, %xmm1
1579; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1580; SSSE3-NEXT:    movsbq 5(%rdi), %rax
1581; SSSE3-NEXT:    movd %rax, %xmm3
1582; SSSE3-NEXT:    movsbq 4(%rdi), %rax
1583; SSSE3-NEXT:    movd %rax, %xmm2
1584; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1585; SSSE3-NEXT:    movsbq 7(%rdi), %rax
1586; SSSE3-NEXT:    movd %rax, %xmm4
1587; SSSE3-NEXT:    movsbq 6(%rdi), %rax
1588; SSSE3-NEXT:    movd %rax, %xmm3
1589; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1590; SSSE3-NEXT:    retq
1591;
1592; SSE41-LABEL: load_sext_8i8_to_8i64:
1593; SSE41:       # BB#0: # %entry
1594; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1595; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
1596; SSE41-NEXT:    pmovsxbq 4(%rdi), %xmm2
1597; SSE41-NEXT:    pmovsxbq 6(%rdi), %xmm3
1598; SSE41-NEXT:    retq
1599;
1600; AVX1-LABEL: load_sext_8i8_to_8i64:
1601; AVX1:       # BB#0: # %entry
1602; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
1603; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1604; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1605; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1606; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1607; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm1
1608; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
1609; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1610; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
1611; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1612; AVX1-NEXT:    retq
1613;
1614; AVX2-LABEL: load_sext_8i8_to_8i64:
1615; AVX2:       # BB#0: # %entry
1616; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
1617; AVX2-NEXT:    vpmovsxbq 4(%rdi), %ymm1
1618; AVX2-NEXT:    retq
1619;
1620; AVX512-LABEL: load_sext_8i8_to_8i64:
1621; AVX512:       # BB#0: # %entry
1622; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
1623; AVX512-NEXT:    retq
1624;
1625; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
1626; X32-SSE41:       # BB#0: # %entry
1627; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1628; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1629; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
1630; X32-SSE41-NEXT:    pmovsxbq 4(%eax), %xmm2
1631; X32-SSE41-NEXT:    pmovsxbq 6(%eax), %xmm3
1632; X32-SSE41-NEXT:    retl
1633entry:
; Test input proper: a vector load followed by a sext; everything above is
; the expected llc output for it.
1634 %X = load <8 x i8>, <8 x i8>* %ptr
1635 %Y = sext <8 x i8> %X to <8 x i64>
1636 ret <8 x i64> %Y
1637}
1638
; Test case: load an <8 x i1> vector through %ptr and sign-extend every lane
; to i32, returning <8 x i32> (each result lane is all-zeros or all-ones).
;
; Expected lowering pinned by the assertions below, per RUN configuration:
;   SSE2 / SSSE3  - movzbl of one byte, each bit isolated with shrl + andl $1,
;                   lanes interleaved with punpcklwd, then pslld $31 followed
;                   by psrad $31 to replicate the bit across each 32-bit lane.
;   SSE4.1        - same bit isolation, lanes gathered with pinsrw, the two
;                   halves widened via pmovzxwd / punpckhwd, then the same
;                   pslld $31 + psrad $31 pair. The 32-bit run first fetches
;                   %ptr from the stack.
;   AVX1 / AVX2   - movsbq load, bits turned into 0 / -1 with shlq + sarq $63
;                   (the top bit via shrq $7), inserted with vpinsrd, and the
;                   two halves joined by vinsertf128 (AVX1) or vinserti128
;                   (AVX2).
;   AVX512        - the byte is moved into mask register k1 with kmovw, an
;                   all-ones zmm is produced by vpternlogd $255, zero-masked by
;                   vmovdqa64 under k1, and narrowed to ymm with vpmovqd.
;
; The assertion lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with the update script instead of editing them by hand.
1639define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
1640; SSE2-LABEL: load_sext_8i1_to_8i32:
1641; SSE2:       # BB#0: # %entry
1642; SSE2-NEXT:    movzbl (%rdi), %eax
1643; SSE2-NEXT:    movl %eax, %ecx
1644; SSE2-NEXT:    shrl $6, %ecx
1645; SSE2-NEXT:    andl $1, %ecx
1646; SSE2-NEXT:    movd %ecx, %xmm0
1647; SSE2-NEXT:    movl %eax, %ecx
1648; SSE2-NEXT:    shrl $2, %ecx
1649; SSE2-NEXT:    andl $1, %ecx
1650; SSE2-NEXT:    movd %ecx, %xmm2
1651; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1652; SSE2-NEXT:    movl %eax, %ecx
1653; SSE2-NEXT:    andl $1, %ecx
1654; SSE2-NEXT:    movd %ecx, %xmm1
1655; SSE2-NEXT:    movl %eax, %ecx
1656; SSE2-NEXT:    shrl $4, %ecx
1657; SSE2-NEXT:    andl $1, %ecx
1658; SSE2-NEXT:    movd %ecx, %xmm0
1659; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1660; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1661; SSE2-NEXT:    movl %eax, %ecx
1662; SSE2-NEXT:    shrl $5, %ecx
1663; SSE2-NEXT:    andl $1, %ecx
1664; SSE2-NEXT:    movd %ecx, %xmm0
1665; SSE2-NEXT:    movl %eax, %ecx
1666; SSE2-NEXT:    shrl %ecx
1667; SSE2-NEXT:    andl $1, %ecx
1668; SSE2-NEXT:    movd %ecx, %xmm2
1669; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1670; SSE2-NEXT:    movl %eax, %ecx
1671; SSE2-NEXT:    shrl $3, %ecx
1672; SSE2-NEXT:    andl $1, %ecx
1673; SSE2-NEXT:    movd %ecx, %xmm0
1674; SSE2-NEXT:    shrl $7, %eax
1675; SSE2-NEXT:    movzwl %ax, %eax
1676; SSE2-NEXT:    movd %eax, %xmm3
1677; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1678; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1679; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1680; SSE2-NEXT:    movdqa %xmm1, %xmm0
1681; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1682; SSE2-NEXT:    pslld $31, %xmm0
1683; SSE2-NEXT:    psrad $31, %xmm0
1684; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1685; SSE2-NEXT:    pslld $31, %xmm1
1686; SSE2-NEXT:    psrad $31, %xmm1
1687; SSE2-NEXT:    retq
1688;
1689; SSSE3-LABEL: load_sext_8i1_to_8i32:
1690; SSSE3:       # BB#0: # %entry
1691; SSSE3-NEXT:    movzbl (%rdi), %eax
1692; SSSE3-NEXT:    movl %eax, %ecx
1693; SSSE3-NEXT:    shrl $6, %ecx
1694; SSSE3-NEXT:    andl $1, %ecx
1695; SSSE3-NEXT:    movd %ecx, %xmm0
1696; SSSE3-NEXT:    movl %eax, %ecx
1697; SSSE3-NEXT:    shrl $2, %ecx
1698; SSSE3-NEXT:    andl $1, %ecx
1699; SSSE3-NEXT:    movd %ecx, %xmm2
1700; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1701; SSSE3-NEXT:    movl %eax, %ecx
1702; SSSE3-NEXT:    andl $1, %ecx
1703; SSSE3-NEXT:    movd %ecx, %xmm1
1704; SSSE3-NEXT:    movl %eax, %ecx
1705; SSSE3-NEXT:    shrl $4, %ecx
1706; SSSE3-NEXT:    andl $1, %ecx
1707; SSSE3-NEXT:    movd %ecx, %xmm0
1708; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1709; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1710; SSSE3-NEXT:    movl %eax, %ecx
1711; SSSE3-NEXT:    shrl $5, %ecx
1712; SSSE3-NEXT:    andl $1, %ecx
1713; SSSE3-NEXT:    movd %ecx, %xmm0
1714; SSSE3-NEXT:    movl %eax, %ecx
1715; SSSE3-NEXT:    shrl %ecx
1716; SSSE3-NEXT:    andl $1, %ecx
1717; SSSE3-NEXT:    movd %ecx, %xmm2
1718; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1719; SSSE3-NEXT:    movl %eax, %ecx
1720; SSSE3-NEXT:    shrl $3, %ecx
1721; SSSE3-NEXT:    andl $1, %ecx
1722; SSSE3-NEXT:    movd %ecx, %xmm0
1723; SSSE3-NEXT:    shrl $7, %eax
1724; SSSE3-NEXT:    movzwl %ax, %eax
1725; SSSE3-NEXT:    movd %eax, %xmm3
1726; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1727; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1728; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1729; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1730; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1731; SSSE3-NEXT:    pslld $31, %xmm0
1732; SSSE3-NEXT:    psrad $31, %xmm0
1733; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1734; SSSE3-NEXT:    pslld $31, %xmm1
1735; SSSE3-NEXT:    psrad $31, %xmm1
1736; SSSE3-NEXT:    retq
1737;
1738; SSE41-LABEL: load_sext_8i1_to_8i32:
1739; SSE41:       # BB#0: # %entry
1740; SSE41-NEXT:    movzbl (%rdi), %eax
1741; SSE41-NEXT:    movl %eax, %ecx
1742; SSE41-NEXT:    shrl %ecx
1743; SSE41-NEXT:    andl $1, %ecx
1744; SSE41-NEXT:    movl %eax, %edx
1745; SSE41-NEXT:    andl $1, %edx
1746; SSE41-NEXT:    movd %edx, %xmm1
1747; SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
1748; SSE41-NEXT:    movl %eax, %ecx
1749; SSE41-NEXT:    shrl $2, %ecx
1750; SSE41-NEXT:    andl $1, %ecx
1751; SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
1752; SSE41-NEXT:    movl %eax, %ecx
1753; SSE41-NEXT:    shrl $3, %ecx
1754; SSE41-NEXT:    andl $1, %ecx
1755; SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
1756; SSE41-NEXT:    movl %eax, %ecx
1757; SSE41-NEXT:    shrl $4, %ecx
1758; SSE41-NEXT:    andl $1, %ecx
1759; SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
1760; SSE41-NEXT:    movl %eax, %ecx
1761; SSE41-NEXT:    shrl $5, %ecx
1762; SSE41-NEXT:    andl $1, %ecx
1763; SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
1764; SSE41-NEXT:    movl %eax, %ecx
1765; SSE41-NEXT:    shrl $6, %ecx
1766; SSE41-NEXT:    andl $1, %ecx
1767; SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
1768; SSE41-NEXT:    shrl $7, %eax
1769; SSE41-NEXT:    movzwl %ax, %eax
1770; SSE41-NEXT:    pinsrw $7, %eax, %xmm1
1771; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1772; SSE41-NEXT:    pslld $31, %xmm0
1773; SSE41-NEXT:    psrad $31, %xmm0
1774; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1775; SSE41-NEXT:    pslld $31, %xmm1
1776; SSE41-NEXT:    psrad $31, %xmm1
1777; SSE41-NEXT:    retq
1778;
1779; AVX1-LABEL: load_sext_8i1_to_8i32:
1780; AVX1:       # BB#0: # %entry
1781; AVX1-NEXT:    movsbq (%rdi), %rax
1782; AVX1-NEXT:    movq %rax, %rcx
1783; AVX1-NEXT:    shlq $58, %rcx
1784; AVX1-NEXT:    sarq $63, %rcx
1785; AVX1-NEXT:    movq %rax, %rdx
1786; AVX1-NEXT:    shlq $59, %rdx
1787; AVX1-NEXT:    sarq $63, %rdx
1788; AVX1-NEXT:    vmovd %edx, %xmm0
1789; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1790; AVX1-NEXT:    movq %rax, %rcx
1791; AVX1-NEXT:    shlq $57, %rcx
1792; AVX1-NEXT:    sarq $63, %rcx
1793; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1794; AVX1-NEXT:    movq %rax, %rcx
1795; AVX1-NEXT:    shrq $7, %rcx
1796; AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
1797; AVX1-NEXT:    movq %rax, %rcx
1798; AVX1-NEXT:    shlq $62, %rcx
1799; AVX1-NEXT:    sarq $63, %rcx
1800; AVX1-NEXT:    movq %rax, %rdx
1801; AVX1-NEXT:    shlq $63, %rdx
1802; AVX1-NEXT:    sarq $63, %rdx
1803; AVX1-NEXT:    vmovd %edx, %xmm1
1804; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
1805; AVX1-NEXT:    movq %rax, %rcx
1806; AVX1-NEXT:    shlq $61, %rcx
1807; AVX1-NEXT:    sarq $63, %rcx
1808; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
1809; AVX1-NEXT:    shlq $60, %rax
1810; AVX1-NEXT:    sarq $63, %rax
1811; AVX1-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
1812; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1813; AVX1-NEXT:    retq
1814;
1815; AVX2-LABEL: load_sext_8i1_to_8i32:
1816; AVX2:       # BB#0: # %entry
1817; AVX2-NEXT:    movsbq (%rdi), %rax
1818; AVX2-NEXT:    movq %rax, %rcx
1819; AVX2-NEXT:    shlq $58, %rcx
1820; AVX2-NEXT:    sarq $63, %rcx
1821; AVX2-NEXT:    movq %rax, %rdx
1822; AVX2-NEXT:    shlq $59, %rdx
1823; AVX2-NEXT:    sarq $63, %rdx
1824; AVX2-NEXT:    vmovd %edx, %xmm0
1825; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1826; AVX2-NEXT:    movq %rax, %rcx
1827; AVX2-NEXT:    shlq $57, %rcx
1828; AVX2-NEXT:    sarq $63, %rcx
1829; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1830; AVX2-NEXT:    movq %rax, %rcx
1831; AVX2-NEXT:    shrq $7, %rcx
1832; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
1833; AVX2-NEXT:    movq %rax, %rcx
1834; AVX2-NEXT:    shlq $62, %rcx
1835; AVX2-NEXT:    sarq $63, %rcx
1836; AVX2-NEXT:    movq %rax, %rdx
1837; AVX2-NEXT:    shlq $63, %rdx
1838; AVX2-NEXT:    sarq $63, %rdx
1839; AVX2-NEXT:    vmovd %edx, %xmm1
1840; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
1841; AVX2-NEXT:    movq %rax, %rcx
1842; AVX2-NEXT:    shlq $61, %rcx
1843; AVX2-NEXT:    sarq $63, %rcx
1844; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
1845; AVX2-NEXT:    shlq $60, %rax
1846; AVX2-NEXT:    sarq $63, %rax
1847; AVX2-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
1848; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1849; AVX2-NEXT:    retq
1850;
1851; AVX512-LABEL: load_sext_8i1_to_8i32:
1852; AVX512:       # BB#0: # %entry
1853; AVX512-NEXT:    movzbl (%rdi), %eax
1854; AVX512-NEXT:    kmovw %eax, %k1
1855; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
1856; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
1857; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1858; AVX512-NEXT:    retq
1859;
1860; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
1861; X32-SSE41:       # BB#0: # %entry
1862; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1863; X32-SSE41-NEXT:    movzbl (%eax), %eax
1864; X32-SSE41-NEXT:    movl %eax, %ecx
1865; X32-SSE41-NEXT:    shrl %ecx
1866; X32-SSE41-NEXT:    andl $1, %ecx
1867; X32-SSE41-NEXT:    movl %eax, %edx
1868; X32-SSE41-NEXT:    andl $1, %edx
1869; X32-SSE41-NEXT:    movd %edx, %xmm1
1870; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
1871; X32-SSE41-NEXT:    movl %eax, %ecx
1872; X32-SSE41-NEXT:    shrl $2, %ecx
1873; X32-SSE41-NEXT:    andl $1, %ecx
1874; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
1875; X32-SSE41-NEXT:    movl %eax, %ecx
1876; X32-SSE41-NEXT:    shrl $3, %ecx
1877; X32-SSE41-NEXT:    andl $1, %ecx
1878; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
1879; X32-SSE41-NEXT:    movl %eax, %ecx
1880; X32-SSE41-NEXT:    shrl $4, %ecx
1881; X32-SSE41-NEXT:    andl $1, %ecx
1882; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
1883; X32-SSE41-NEXT:    movl %eax, %ecx
1884; X32-SSE41-NEXT:    shrl $5, %ecx
1885; X32-SSE41-NEXT:    andl $1, %ecx
1886; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
1887; X32-SSE41-NEXT:    movl %eax, %ecx
1888; X32-SSE41-NEXT:    shrl $6, %ecx
1889; X32-SSE41-NEXT:    andl $1, %ecx
1890; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
1891; X32-SSE41-NEXT:    shrl $7, %eax
1892; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm1
1893; X32-SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1894; X32-SSE41-NEXT:    pslld $31, %xmm0
1895; X32-SSE41-NEXT:    psrad $31, %xmm0
1896; X32-SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1897; X32-SSE41-NEXT:    pslld $31, %xmm1
1898; X32-SSE41-NEXT:    psrad $31, %xmm1
1899; X32-SSE41-NEXT:    retl
1900entry:
; Test input proper: a vector-of-i1 load followed by a sext; everything above
; is the expected llc output for it.
1901 %X = load <8 x i1>, <8 x i1>* %ptr
1902 %Y = sext <8 x i1> %X to <8 x i32>
1903 ret <8 x i32> %Y
1904}
1905
; Test case: load an <8 x i8> vector through %ptr and sign-extend every lane
; to i32, returning <8 x i32>.
;
; Expected lowering pinned by the assertions below, per RUN configuration:
;   SSE2 / SSSE3  - two movd loads, each widened by punpcklbw + punpcklwd (the
;                   byte lands in the top of each 32-bit lane) and then
;                   sign-filled with psrad $24.
;   SSE4.1        - two pmovsxbd loads at byte offsets 0 and 4; the 32-bit run
;                   is identical after fetching %ptr from the stack.
;   AVX1          - one vpmovsxbw load, both halves widened with vpmovsxwd and
;                   joined by vinsertf128.
;   AVX2 / AVX512 - a single ymm vpmovsxbd load.
;
; The assertion lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with the update script instead of editing them by hand.
1906define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
1907; SSE2-LABEL: load_sext_8i8_to_8i32:
1908; SSE2:       # BB#0: # %entry
1909; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1910; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1911; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1912; SSE2-NEXT:    psrad $24, %xmm0
1913; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1914; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1915; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1916; SSE2-NEXT:    psrad $24, %xmm1
1917; SSE2-NEXT:    retq
1918;
1919; SSSE3-LABEL: load_sext_8i8_to_8i32:
1920; SSSE3:       # BB#0: # %entry
1921; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1922; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1923; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1924; SSSE3-NEXT:    psrad $24, %xmm0
1925; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1926; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1927; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1928; SSSE3-NEXT:    psrad $24, %xmm1
1929; SSSE3-NEXT:    retq
1930;
1931; SSE41-LABEL: load_sext_8i8_to_8i32:
1932; SSE41:       # BB#0: # %entry
1933; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1934; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
1935; SSE41-NEXT:    retq
1936;
1937; AVX1-LABEL: load_sext_8i8_to_8i32:
1938; AVX1:       # BB#0: # %entry
1939; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
1940; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
1941; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1942; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
1943; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1944; AVX1-NEXT:    retq
1945;
1946; AVX2-LABEL: load_sext_8i8_to_8i32:
1947; AVX2:       # BB#0: # %entry
1948; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
1949; AVX2-NEXT:    retq
1950;
1951; AVX512-LABEL: load_sext_8i8_to_8i32:
1952; AVX512:       # BB#0: # %entry
1953; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
1954; AVX512-NEXT:    retq
1955;
1956; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
1957; X32-SSE41:       # BB#0: # %entry
1958; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1959; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1960; X32-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
1961; X32-SSE41-NEXT:    retl
1962entry:
; Test input proper: a vector load followed by a sext; everything above is
; the expected llc output for it.
1963 %X = load <8 x i8>, <8 x i8>* %ptr
1964 %Y = sext <8 x i8> %X to <8 x i32>
1965 ret <8 x i32> %Y
1966}
1967
1968define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
1969; SSE2-LABEL: load_sext_16i1_to_16i8:
1970; SSE2:       # BB#0: # %entry
1971; SSE2-NEXT:    pushq %rbp
1972; SSE2-NEXT:    pushq %r15
1973; SSE2-NEXT:    pushq %r14
1974; SSE2-NEXT:    pushq %r13
1975; SSE2-NEXT:    pushq %r12
1976; SSE2-NEXT:    pushq %rbx
1977; SSE2-NEXT:    movswq (%rdi), %rax
1978; SSE2-NEXT:    movq %rax, %r8
1979; SSE2-NEXT:    movq %rax, %r9
1980; SSE2-NEXT:    movq %rax, %r10
1981; SSE2-NEXT:    movq %rax, %r11
1982; SSE2-NEXT:    movq %rax, %r14
1983; SSE2-NEXT:    movq %rax, %r15
1984; SSE2-NEXT:    movq %rax, %r12
1985; SSE2-NEXT:    movq %rax, %r13
1986; SSE2-NEXT:    movq %rax, %rbx
1987; SSE2-NEXT:    movq %rax, %rcx
1988; SSE2-NEXT:    movq %rax, %rdx
1989; SSE2-NEXT:    movq %rax, %rsi
1990; SSE2-NEXT:    movq %rax, %rdi
1991; SSE2-NEXT:    movq %rax, %rbp
1992; SSE2-NEXT:    shlq $49, %rbp
1993; SSE2-NEXT:    sarq $63, %rbp
1994; SSE2-NEXT:    movd %ebp, %xmm0
1995; SSE2-NEXT:    movq %rax, %rbp
1996; SSE2-NEXT:    movsbq %al, %rax
1997; SSE2-NEXT:    shlq $57, %r8
1998; SSE2-NEXT:    sarq $63, %r8
1999; SSE2-NEXT:    movd %r8d, %xmm1
2000; SSE2-NEXT:    shlq $53, %r9
2001; SSE2-NEXT:    sarq $63, %r9
2002; SSE2-NEXT:    movd %r9d, %xmm2
2003; SSE2-NEXT:    shlq $61, %r10
2004; SSE2-NEXT:    sarq $63, %r10
2005; SSE2-NEXT:    movd %r10d, %xmm3
2006; SSE2-NEXT:    shlq $51, %r11
2007; SSE2-NEXT:    sarq $63, %r11
2008; SSE2-NEXT:    movd %r11d, %xmm4
2009; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2010; SSE2-NEXT:    shlq $59, %r14
2011; SSE2-NEXT:    sarq $63, %r14
2012; SSE2-NEXT:    movd %r14d, %xmm5
2013; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2014; SSE2-NEXT:    shlq $55, %r15
2015; SSE2-NEXT:    sarq $63, %r15
2016; SSE2-NEXT:    movd %r15d, %xmm2
2017; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2018; SSE2-NEXT:    shlq $63, %r12
2019; SSE2-NEXT:    sarq $63, %r12
2020; SSE2-NEXT:    movd %r12d, %xmm0
2021; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2022; SSE2-NEXT:    shlq $50, %r13
2023; SSE2-NEXT:    sarq $63, %r13
2024; SSE2-NEXT:    movd %r13d, %xmm1
2025; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2026; SSE2-NEXT:    shlq $58, %rbx
2027; SSE2-NEXT:    sarq $63, %rbx
2028; SSE2-NEXT:    movd %ebx, %xmm2
2029; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2030; SSE2-NEXT:    shlq $54, %rcx
2031; SSE2-NEXT:    sarq $63, %rcx
2032; SSE2-NEXT:    movd %ecx, %xmm4
2033; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2034; SSE2-NEXT:    shlq $62, %rdx
2035; SSE2-NEXT:    sarq $63, %rdx
2036; SSE2-NEXT:    movd %edx, %xmm3
2037; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2038; SSE2-NEXT:    shlq $52, %rsi
2039; SSE2-NEXT:    sarq $63, %rsi
2040; SSE2-NEXT:    movd %esi, %xmm1
2041; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2042; SSE2-NEXT:    shlq $60, %rdi
2043; SSE2-NEXT:    sarq $63, %rdi
2044; SSE2-NEXT:    movd %edi, %xmm4
2045; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2046; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
2047; SSE2-NEXT:    shrq $15, %rbp
2048; SSE2-NEXT:    movd %ebp, %xmm1
2049; SSE2-NEXT:    shrq $7, %rax
2050; SSE2-NEXT:    movd %eax, %xmm2
2051; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2052; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2053; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2054; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2055; SSE2-NEXT:    popq %rbx
2056; SSE2-NEXT:    popq %r12
2057; SSE2-NEXT:    popq %r13
2058; SSE2-NEXT:    popq %r14
2059; SSE2-NEXT:    popq %r15
2060; SSE2-NEXT:    popq %rbp
2061; SSE2-NEXT:    retq
2062;
2063; SSSE3-LABEL: load_sext_16i1_to_16i8:
2064; SSSE3:       # BB#0: # %entry
2065; SSSE3-NEXT:    pushq %rbp
2066; SSSE3-NEXT:    pushq %r15
2067; SSSE3-NEXT:    pushq %r14
2068; SSSE3-NEXT:    pushq %r13
2069; SSSE3-NEXT:    pushq %r12
2070; SSSE3-NEXT:    pushq %rbx
2071; SSSE3-NEXT:    movswq (%rdi), %rax
2072; SSSE3-NEXT:    movq %rax, %r8
2073; SSSE3-NEXT:    movq %rax, %r9
2074; SSSE3-NEXT:    movq %rax, %r10
2075; SSSE3-NEXT:    movq %rax, %r11
2076; SSSE3-NEXT:    movq %rax, %r14
2077; SSSE3-NEXT:    movq %rax, %r15
2078; SSSE3-NEXT:    movq %rax, %r12
2079; SSSE3-NEXT:    movq %rax, %r13
2080; SSSE3-NEXT:    movq %rax, %rbx
2081; SSSE3-NEXT:    movq %rax, %rcx
2082; SSSE3-NEXT:    movq %rax, %rdx
2083; SSSE3-NEXT:    movq %rax, %rsi
2084; SSSE3-NEXT:    movq %rax, %rdi
2085; SSSE3-NEXT:    movq %rax, %rbp
2086; SSSE3-NEXT:    shlq $49, %rbp
2087; SSSE3-NEXT:    sarq $63, %rbp
2088; SSSE3-NEXT:    movd %ebp, %xmm0
2089; SSSE3-NEXT:    movq %rax, %rbp
2090; SSSE3-NEXT:    movsbq %al, %rax
2091; SSSE3-NEXT:    shlq $57, %r8
2092; SSSE3-NEXT:    sarq $63, %r8
2093; SSSE3-NEXT:    movd %r8d, %xmm1
2094; SSSE3-NEXT:    shlq $53, %r9
2095; SSSE3-NEXT:    sarq $63, %r9
2096; SSSE3-NEXT:    movd %r9d, %xmm2
2097; SSSE3-NEXT:    shlq $61, %r10
2098; SSSE3-NEXT:    sarq $63, %r10
2099; SSSE3-NEXT:    movd %r10d, %xmm3
2100; SSSE3-NEXT:    shlq $51, %r11
2101; SSSE3-NEXT:    sarq $63, %r11
2102; SSSE3-NEXT:    movd %r11d, %xmm4
2103; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2104; SSSE3-NEXT:    shlq $59, %r14
2105; SSSE3-NEXT:    sarq $63, %r14
2106; SSSE3-NEXT:    movd %r14d, %xmm5
2107; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2108; SSSE3-NEXT:    shlq $55, %r15
2109; SSSE3-NEXT:    sarq $63, %r15
2110; SSSE3-NEXT:    movd %r15d, %xmm2
2111; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2112; SSSE3-NEXT:    shlq $63, %r12
2113; SSSE3-NEXT:    sarq $63, %r12
2114; SSSE3-NEXT:    movd %r12d, %xmm0
2115; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2116; SSSE3-NEXT:    shlq $50, %r13
2117; SSSE3-NEXT:    sarq $63, %r13
2118; SSSE3-NEXT:    movd %r13d, %xmm1
2119; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2120; SSSE3-NEXT:    shlq $58, %rbx
2121; SSSE3-NEXT:    sarq $63, %rbx
2122; SSSE3-NEXT:    movd %ebx, %xmm2
2123; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2124; SSSE3-NEXT:    shlq $54, %rcx
2125; SSSE3-NEXT:    sarq $63, %rcx
2126; SSSE3-NEXT:    movd %ecx, %xmm4
2127; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2128; SSSE3-NEXT:    shlq $62, %rdx
2129; SSSE3-NEXT:    sarq $63, %rdx
2130; SSSE3-NEXT:    movd %edx, %xmm3
2131; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2132; SSSE3-NEXT:    shlq $52, %rsi
2133; SSSE3-NEXT:    sarq $63, %rsi
2134; SSSE3-NEXT:    movd %esi, %xmm1
2135; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2136; SSSE3-NEXT:    shlq $60, %rdi
2137; SSSE3-NEXT:    sarq $63, %rdi
2138; SSSE3-NEXT:    movd %edi, %xmm4
2139; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2140; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
2141; SSSE3-NEXT:    shrq $15, %rbp
2142; SSSE3-NEXT:    movd %ebp, %xmm1
2143; SSSE3-NEXT:    shrq $7, %rax
2144; SSSE3-NEXT:    movd %eax, %xmm2
2145; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2146; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2147; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2148; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2149; SSSE3-NEXT:    popq %rbx
2150; SSSE3-NEXT:    popq %r12
2151; SSSE3-NEXT:    popq %r13
2152; SSSE3-NEXT:    popq %r14
2153; SSSE3-NEXT:    popq %r15
2154; SSSE3-NEXT:    popq %rbp
2155; SSSE3-NEXT:    retq
2156;
2157; SSE41-LABEL: load_sext_16i1_to_16i8:
2158; SSE41:       # BB#0: # %entry
2159; SSE41-NEXT:    movswq (%rdi), %rax
2160; SSE41-NEXT:    movq %rax, %rcx
2161; SSE41-NEXT:    shlq $62, %rcx
2162; SSE41-NEXT:    sarq $63, %rcx
2163; SSE41-NEXT:    movq %rax, %rdx
2164; SSE41-NEXT:    shlq $63, %rdx
2165; SSE41-NEXT:    sarq $63, %rdx
2166; SSE41-NEXT:    movd %edx, %xmm0
2167; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
2168; SSE41-NEXT:    movq %rax, %rcx
2169; SSE41-NEXT:    shlq $61, %rcx
2170; SSE41-NEXT:    sarq $63, %rcx
2171; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
2172; SSE41-NEXT:    movq %rax, %rcx
2173; SSE41-NEXT:    shlq $60, %rcx
2174; SSE41-NEXT:    sarq $63, %rcx
2175; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
2176; SSE41-NEXT:    movq %rax, %rcx
2177; SSE41-NEXT:    shlq $59, %rcx
2178; SSE41-NEXT:    sarq $63, %rcx
2179; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
2180; SSE41-NEXT:    movq %rax, %rcx
2181; SSE41-NEXT:    shlq $58, %rcx
2182; SSE41-NEXT:    sarq $63, %rcx
2183; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
2184; SSE41-NEXT:    movq %rax, %rcx
2185; SSE41-NEXT:    shlq $57, %rcx
2186; SSE41-NEXT:    sarq $63, %rcx
2187; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
2188; SSE41-NEXT:    movsbq %al, %rcx
2189; SSE41-NEXT:    shrq $7, %rcx
2190; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
2191; SSE41-NEXT:    movq %rax, %rcx
2192; SSE41-NEXT:    shlq $55, %rcx
2193; SSE41-NEXT:    sarq $63, %rcx
2194; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
2195; SSE41-NEXT:    movq %rax, %rcx
2196; SSE41-NEXT:    shlq $54, %rcx
2197; SSE41-NEXT:    sarq $63, %rcx
2198; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
2199; SSE41-NEXT:    movq %rax, %rcx
2200; SSE41-NEXT:    shlq $53, %rcx
2201; SSE41-NEXT:    sarq $63, %rcx
2202; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
2203; SSE41-NEXT:    movq %rax, %rcx
2204; SSE41-NEXT:    shlq $52, %rcx
2205; SSE41-NEXT:    sarq $63, %rcx
2206; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
2207; SSE41-NEXT:    movq %rax, %rcx
2208; SSE41-NEXT:    shlq $51, %rcx
2209; SSE41-NEXT:    sarq $63, %rcx
2210; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
2211; SSE41-NEXT:    movq %rax, %rcx
2212; SSE41-NEXT:    shlq $50, %rcx
2213; SSE41-NEXT:    sarq $63, %rcx
2214; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
2215; SSE41-NEXT:    movq %rax, %rcx
2216; SSE41-NEXT:    shlq $49, %rcx
2217; SSE41-NEXT:    sarq $63, %rcx
2218; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
2219; SSE41-NEXT:    shrq $15, %rax
2220; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
2221; SSE41-NEXT:    retq
2222;
2223; AVX1-LABEL: load_sext_16i1_to_16i8:
2224; AVX1:       # BB#0: # %entry
2225; AVX1-NEXT:    movswq (%rdi), %rax
2226; AVX1-NEXT:    movq %rax, %rcx
2227; AVX1-NEXT:    shlq $62, %rcx
2228; AVX1-NEXT:    sarq $63, %rcx
2229; AVX1-NEXT:    movq %rax, %rdx
2230; AVX1-NEXT:    shlq $63, %rdx
2231; AVX1-NEXT:    sarq $63, %rdx
2232; AVX1-NEXT:    vmovd %edx, %xmm0
2233; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
2234; AVX1-NEXT:    movq %rax, %rcx
2235; AVX1-NEXT:    shlq $61, %rcx
2236; AVX1-NEXT:    sarq $63, %rcx
2237; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
2238; AVX1-NEXT:    movq %rax, %rcx
2239; AVX1-NEXT:    shlq $60, %rcx
2240; AVX1-NEXT:    sarq $63, %rcx
2241; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
2242; AVX1-NEXT:    movq %rax, %rcx
2243; AVX1-NEXT:    shlq $59, %rcx
2244; AVX1-NEXT:    sarq $63, %rcx
2245; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
2246; AVX1-NEXT:    movq %rax, %rcx
2247; AVX1-NEXT:    shlq $58, %rcx
2248; AVX1-NEXT:    sarq $63, %rcx
2249; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
2250; AVX1-NEXT:    movq %rax, %rcx
2251; AVX1-NEXT:    shlq $57, %rcx
2252; AVX1-NEXT:    sarq $63, %rcx
2253; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
2254; AVX1-NEXT:    movsbq %al, %rcx
2255; AVX1-NEXT:    shrq $7, %rcx
2256; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
2257; AVX1-NEXT:    movq %rax, %rcx
2258; AVX1-NEXT:    shlq $55, %rcx
2259; AVX1-NEXT:    sarq $63, %rcx
2260; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
2261; AVX1-NEXT:    movq %rax, %rcx
2262; AVX1-NEXT:    shlq $54, %rcx
2263; AVX1-NEXT:    sarq $63, %rcx
2264; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
2265; AVX1-NEXT:    movq %rax, %rcx
2266; AVX1-NEXT:    shlq $53, %rcx
2267; AVX1-NEXT:    sarq $63, %rcx
2268; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
2269; AVX1-NEXT:    movq %rax, %rcx
2270; AVX1-NEXT:    shlq $52, %rcx
2271; AVX1-NEXT:    sarq $63, %rcx
2272; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
2273; AVX1-NEXT:    movq %rax, %rcx
2274; AVX1-NEXT:    shlq $51, %rcx
2275; AVX1-NEXT:    sarq $63, %rcx
2276; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
2277; AVX1-NEXT:    movq %rax, %rcx
2278; AVX1-NEXT:    shlq $50, %rcx
2279; AVX1-NEXT:    sarq $63, %rcx
2280; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
2281; AVX1-NEXT:    movq %rax, %rcx
2282; AVX1-NEXT:    shlq $49, %rcx
2283; AVX1-NEXT:    sarq $63, %rcx
2284; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
2285; AVX1-NEXT:    shrq $15, %rax
2286; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2287; AVX1-NEXT:    retq
2288;
2289; AVX2-LABEL: load_sext_16i1_to_16i8:
2290; AVX2:       # BB#0: # %entry
2291; AVX2-NEXT:    movswq (%rdi), %rax
2292; AVX2-NEXT:    movq %rax, %rcx
2293; AVX2-NEXT:    shlq $62, %rcx
2294; AVX2-NEXT:    sarq $63, %rcx
2295; AVX2-NEXT:    movq %rax, %rdx
2296; AVX2-NEXT:    shlq $63, %rdx
2297; AVX2-NEXT:    sarq $63, %rdx
2298; AVX2-NEXT:    vmovd %edx, %xmm0
2299; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
2300; AVX2-NEXT:    movq %rax, %rcx
2301; AVX2-NEXT:    shlq $61, %rcx
2302; AVX2-NEXT:    sarq $63, %rcx
2303; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
2304; AVX2-NEXT:    movq %rax, %rcx
2305; AVX2-NEXT:    shlq $60, %rcx
2306; AVX2-NEXT:    sarq $63, %rcx
2307; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
2308; AVX2-NEXT:    movq %rax, %rcx
2309; AVX2-NEXT:    shlq $59, %rcx
2310; AVX2-NEXT:    sarq $63, %rcx
2311; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
2312; AVX2-NEXT:    movq %rax, %rcx
2313; AVX2-NEXT:    shlq $58, %rcx
2314; AVX2-NEXT:    sarq $63, %rcx
2315; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
2316; AVX2-NEXT:    movq %rax, %rcx
2317; AVX2-NEXT:    shlq $57, %rcx
2318; AVX2-NEXT:    sarq $63, %rcx
2319; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
2320; AVX2-NEXT:    movsbq %al, %rcx
2321; AVX2-NEXT:    shrq $7, %rcx
2322; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
2323; AVX2-NEXT:    movq %rax, %rcx
2324; AVX2-NEXT:    shlq $55, %rcx
2325; AVX2-NEXT:    sarq $63, %rcx
2326; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
2327; AVX2-NEXT:    movq %rax, %rcx
2328; AVX2-NEXT:    shlq $54, %rcx
2329; AVX2-NEXT:    sarq $63, %rcx
2330; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
2331; AVX2-NEXT:    movq %rax, %rcx
2332; AVX2-NEXT:    shlq $53, %rcx
2333; AVX2-NEXT:    sarq $63, %rcx
2334; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
2335; AVX2-NEXT:    movq %rax, %rcx
2336; AVX2-NEXT:    shlq $52, %rcx
2337; AVX2-NEXT:    sarq $63, %rcx
2338; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
2339; AVX2-NEXT:    movq %rax, %rcx
2340; AVX2-NEXT:    shlq $51, %rcx
2341; AVX2-NEXT:    sarq $63, %rcx
2342; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
2343; AVX2-NEXT:    movq %rax, %rcx
2344; AVX2-NEXT:    shlq $50, %rcx
2345; AVX2-NEXT:    sarq $63, %rcx
2346; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
2347; AVX2-NEXT:    movq %rax, %rcx
2348; AVX2-NEXT:    shlq $49, %rcx
2349; AVX2-NEXT:    sarq $63, %rcx
2350; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
2351; AVX2-NEXT:    shrq $15, %rax
2352; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2353; AVX2-NEXT:    retq
2354;
2355; AVX512-LABEL: load_sext_16i1_to_16i8:
2356; AVX512:       # BB#0: # %entry
2357; AVX512-NEXT:    kmovw (%rdi), %k1
2358; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
2359; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
2360; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2361; AVX512-NEXT:    retq
2362;
2363; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
2364; X32-SSE41:       # BB#0: # %entry
2365; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2366; X32-SSE41-NEXT:    movswl (%eax), %eax
2367; X32-SSE41-NEXT:    movl %eax, %ecx
2368; X32-SSE41-NEXT:    shll $30, %ecx
2369; X32-SSE41-NEXT:    sarl $31, %ecx
2370; X32-SSE41-NEXT:    movl %eax, %edx
2371; X32-SSE41-NEXT:    shll $31, %edx
2372; X32-SSE41-NEXT:    sarl $31, %edx
2373; X32-SSE41-NEXT:    movd %edx, %xmm0
2374; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
2375; X32-SSE41-NEXT:    movl %eax, %ecx
2376; X32-SSE41-NEXT:    shll $29, %ecx
2377; X32-SSE41-NEXT:    sarl $31, %ecx
2378; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
2379; X32-SSE41-NEXT:    movl %eax, %ecx
2380; X32-SSE41-NEXT:    shll $28, %ecx
2381; X32-SSE41-NEXT:    sarl $31, %ecx
2382; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
2383; X32-SSE41-NEXT:    movl %eax, %ecx
2384; X32-SSE41-NEXT:    shll $27, %ecx
2385; X32-SSE41-NEXT:    sarl $31, %ecx
2386; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
2387; X32-SSE41-NEXT:    movl %eax, %ecx
2388; X32-SSE41-NEXT:    shll $26, %ecx
2389; X32-SSE41-NEXT:    sarl $31, %ecx
2390; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
2391; X32-SSE41-NEXT:    movl %eax, %ecx
2392; X32-SSE41-NEXT:    shll $25, %ecx
2393; X32-SSE41-NEXT:    sarl $31, %ecx
2394; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
2395; X32-SSE41-NEXT:    movsbl %al, %ecx
2396; X32-SSE41-NEXT:    shrl $7, %ecx
2397; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
2398; X32-SSE41-NEXT:    movl %eax, %ecx
2399; X32-SSE41-NEXT:    shll $23, %ecx
2400; X32-SSE41-NEXT:    sarl $31, %ecx
2401; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
2402; X32-SSE41-NEXT:    movl %eax, %ecx
2403; X32-SSE41-NEXT:    shll $22, %ecx
2404; X32-SSE41-NEXT:    sarl $31, %ecx
2405; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
2406; X32-SSE41-NEXT:    movl %eax, %ecx
2407; X32-SSE41-NEXT:    shll $21, %ecx
2408; X32-SSE41-NEXT:    sarl $31, %ecx
2409; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
2410; X32-SSE41-NEXT:    movl %eax, %ecx
2411; X32-SSE41-NEXT:    shll $20, %ecx
2412; X32-SSE41-NEXT:    sarl $31, %ecx
2413; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
2414; X32-SSE41-NEXT:    movl %eax, %ecx
2415; X32-SSE41-NEXT:    shll $19, %ecx
2416; X32-SSE41-NEXT:    sarl $31, %ecx
2417; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
2418; X32-SSE41-NEXT:    movl %eax, %ecx
2419; X32-SSE41-NEXT:    shll $18, %ecx
2420; X32-SSE41-NEXT:    sarl $31, %ecx
2421; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
2422; X32-SSE41-NEXT:    movl %eax, %ecx
2423; X32-SSE41-NEXT:    shll $17, %ecx
2424; X32-SSE41-NEXT:    sarl $31, %ecx
2425; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
2426; X32-SSE41-NEXT:    shrl $15, %eax
2427; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm0
2428; X32-SSE41-NEXT:    retl
2429entry:
2430 %X = load <16 x i1>, <16 x i1>* %ptr
2431 %Y = sext <16 x i1> %X to <16 x i8>
2432 ret <16 x i8> %Y
2433}
2434
; Test: load a <16 x i1> mask from %ptr and sign-extend it to <16 x i16>.
;
; The expected assembly below is grouped by FileCheck prefix:
;   * SSE2 / SSSE3: each bit is isolated with shrl/andl $1, the sixteen values
;     are interleaved into one register with punpcklbw, and both halves are
;     widened with psllw $15 / psraw $15.
;   * SSE41 / X32-SSE41: same idea, but the bytes are gathered with pinsrb and
;     the low half is widened via pmovzxbw.
;   * AVX1 / AVX2: each bit is turned into an all-zeros/all-ones value in a GPR
;     and inserted with vpinsrw; the two xmm halves are joined with
;     vinsertf128 (AVX1) or vinserti128 (AVX2).
;   * AVX512: the mask is loaded straight into %k1 and expanded from there.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; NOTE(review): unlike the following load_sext_32i1_to_32i8 test, this function
; has no `nounwind` attribute, which is presumably why the AVX1/AVX2
; expectations also pin .LtmpN labels and .cfi_* directives. Consider adding
; `nounwind` and regenerating the checks -- confirm against llc output.
2435define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
2436; SSE2-LABEL: load_sext_16i1_to_16i16:
2437; SSE2:       # BB#0: # %entry
2438; SSE2-NEXT:    movzwl (%rdi), %eax
2439; SSE2-NEXT:    movl %eax, %ecx
2440; SSE2-NEXT:    shrl $14, %ecx
2441; SSE2-NEXT:    andl $1, %ecx
2442; SSE2-NEXT:    movd %ecx, %xmm0
2443; SSE2-NEXT:    movl %eax, %ecx
2444; SSE2-NEXT:    shrl $6, %ecx
2445; SSE2-NEXT:    andl $1, %ecx
2446; SSE2-NEXT:    movd %ecx, %xmm1
2447; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2448; SSE2-NEXT:    movl %eax, %ecx
2449; SSE2-NEXT:    shrl $10, %ecx
2450; SSE2-NEXT:    andl $1, %ecx
2451; SSE2-NEXT:    movd %ecx, %xmm0
2452; SSE2-NEXT:    movl %eax, %ecx
2453; SSE2-NEXT:    shrl $2, %ecx
2454; SSE2-NEXT:    andl $1, %ecx
2455; SSE2-NEXT:    movd %ecx, %xmm2
2456; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2457; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2458; SSE2-NEXT:    movl %eax, %ecx
2459; SSE2-NEXT:    shrl $12, %ecx
2460; SSE2-NEXT:    andl $1, %ecx
2461; SSE2-NEXT:    movd %ecx, %xmm0
2462; SSE2-NEXT:    movl %eax, %ecx
2463; SSE2-NEXT:    shrl $4, %ecx
2464; SSE2-NEXT:    andl $1, %ecx
2465; SSE2-NEXT:    movd %ecx, %xmm3
2466; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2467; SSE2-NEXT:    movl %eax, %ecx
2468; SSE2-NEXT:    andl $1, %ecx
2469; SSE2-NEXT:    movd %ecx, %xmm1
2470; SSE2-NEXT:    movl %eax, %ecx
2471; SSE2-NEXT:    shrl $8, %ecx
2472; SSE2-NEXT:    andl $1, %ecx
2473; SSE2-NEXT:    movd %ecx, %xmm0
2474; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2475; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2476; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2477; SSE2-NEXT:    movl %eax, %ecx
2478; SSE2-NEXT:    shrl $13, %ecx
2479; SSE2-NEXT:    andl $1, %ecx
2480; SSE2-NEXT:    movd %ecx, %xmm0
2481; SSE2-NEXT:    movl %eax, %ecx
2482; SSE2-NEXT:    shrl $5, %ecx
2483; SSE2-NEXT:    andl $1, %ecx
2484; SSE2-NEXT:    movd %ecx, %xmm2
2485; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2486; SSE2-NEXT:    movl %eax, %ecx
2487; SSE2-NEXT:    shrl $9, %ecx
2488; SSE2-NEXT:    andl $1, %ecx
2489; SSE2-NEXT:    movd %ecx, %xmm3
2490; SSE2-NEXT:    movl %eax, %ecx
2491; SSE2-NEXT:    shrl %ecx
2492; SSE2-NEXT:    andl $1, %ecx
2493; SSE2-NEXT:    movd %ecx, %xmm0
2494; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2495; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2496; SSE2-NEXT:    movl %eax, %ecx
2497; SSE2-NEXT:    shrl $11, %ecx
2498; SSE2-NEXT:    andl $1, %ecx
2499; SSE2-NEXT:    movd %ecx, %xmm2
2500; SSE2-NEXT:    movl %eax, %ecx
2501; SSE2-NEXT:    shrl $3, %ecx
2502; SSE2-NEXT:    andl $1, %ecx
2503; SSE2-NEXT:    movd %ecx, %xmm3
2504; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2505; SSE2-NEXT:    movl %eax, %ecx
2506; SSE2-NEXT:    shrl $7, %ecx
2507; SSE2-NEXT:    andl $1, %ecx
2508; SSE2-NEXT:    movd %ecx, %xmm2
2509; SSE2-NEXT:    shrl $15, %eax
2510; SSE2-NEXT:    movzwl %ax, %eax
2511; SSE2-NEXT:    movd %eax, %xmm4
2512; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2513; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2514; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2515; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2516; SSE2-NEXT:    movdqa %xmm1, %xmm0
2517; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2518; SSE2-NEXT:    psllw $15, %xmm0
2519; SSE2-NEXT:    psraw $15, %xmm0
2520; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2521; SSE2-NEXT:    psllw $15, %xmm1
2522; SSE2-NEXT:    psraw $15, %xmm1
2523; SSE2-NEXT:    retq
2524;
2525; SSSE3-LABEL: load_sext_16i1_to_16i16:
2526; SSSE3:       # BB#0: # %entry
2527; SSSE3-NEXT:    movzwl (%rdi), %eax
2528; SSSE3-NEXT:    movl %eax, %ecx
2529; SSSE3-NEXT:    shrl $14, %ecx
2530; SSSE3-NEXT:    andl $1, %ecx
2531; SSSE3-NEXT:    movd %ecx, %xmm0
2532; SSSE3-NEXT:    movl %eax, %ecx
2533; SSSE3-NEXT:    shrl $6, %ecx
2534; SSSE3-NEXT:    andl $1, %ecx
2535; SSSE3-NEXT:    movd %ecx, %xmm1
2536; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2537; SSSE3-NEXT:    movl %eax, %ecx
2538; SSSE3-NEXT:    shrl $10, %ecx
2539; SSSE3-NEXT:    andl $1, %ecx
2540; SSSE3-NEXT:    movd %ecx, %xmm0
2541; SSSE3-NEXT:    movl %eax, %ecx
2542; SSSE3-NEXT:    shrl $2, %ecx
2543; SSSE3-NEXT:    andl $1, %ecx
2544; SSSE3-NEXT:    movd %ecx, %xmm2
2545; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2546; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2547; SSSE3-NEXT:    movl %eax, %ecx
2548; SSSE3-NEXT:    shrl $12, %ecx
2549; SSSE3-NEXT:    andl $1, %ecx
2550; SSSE3-NEXT:    movd %ecx, %xmm0
2551; SSSE3-NEXT:    movl %eax, %ecx
2552; SSSE3-NEXT:    shrl $4, %ecx
2553; SSSE3-NEXT:    andl $1, %ecx
2554; SSSE3-NEXT:    movd %ecx, %xmm3
2555; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2556; SSSE3-NEXT:    movl %eax, %ecx
2557; SSSE3-NEXT:    andl $1, %ecx
2558; SSSE3-NEXT:    movd %ecx, %xmm1
2559; SSSE3-NEXT:    movl %eax, %ecx
2560; SSSE3-NEXT:    shrl $8, %ecx
2561; SSSE3-NEXT:    andl $1, %ecx
2562; SSSE3-NEXT:    movd %ecx, %xmm0
2563; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2564; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2565; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2566; SSSE3-NEXT:    movl %eax, %ecx
2567; SSSE3-NEXT:    shrl $13, %ecx
2568; SSSE3-NEXT:    andl $1, %ecx
2569; SSSE3-NEXT:    movd %ecx, %xmm0
2570; SSSE3-NEXT:    movl %eax, %ecx
2571; SSSE3-NEXT:    shrl $5, %ecx
2572; SSSE3-NEXT:    andl $1, %ecx
2573; SSSE3-NEXT:    movd %ecx, %xmm2
2574; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2575; SSSE3-NEXT:    movl %eax, %ecx
2576; SSSE3-NEXT:    shrl $9, %ecx
2577; SSSE3-NEXT:    andl $1, %ecx
2578; SSSE3-NEXT:    movd %ecx, %xmm3
2579; SSSE3-NEXT:    movl %eax, %ecx
2580; SSSE3-NEXT:    shrl %ecx
2581; SSSE3-NEXT:    andl $1, %ecx
2582; SSSE3-NEXT:    movd %ecx, %xmm0
2583; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2584; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2585; SSSE3-NEXT:    movl %eax, %ecx
2586; SSSE3-NEXT:    shrl $11, %ecx
2587; SSSE3-NEXT:    andl $1, %ecx
2588; SSSE3-NEXT:    movd %ecx, %xmm2
2589; SSSE3-NEXT:    movl %eax, %ecx
2590; SSSE3-NEXT:    shrl $3, %ecx
2591; SSSE3-NEXT:    andl $1, %ecx
2592; SSSE3-NEXT:    movd %ecx, %xmm3
2593; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2594; SSSE3-NEXT:    movl %eax, %ecx
2595; SSSE3-NEXT:    shrl $7, %ecx
2596; SSSE3-NEXT:    andl $1, %ecx
2597; SSSE3-NEXT:    movd %ecx, %xmm2
2598; SSSE3-NEXT:    shrl $15, %eax
2599; SSSE3-NEXT:    movzwl %ax, %eax
2600; SSSE3-NEXT:    movd %eax, %xmm4
2601; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2602; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2603; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2604; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2605; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2606; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2607; SSSE3-NEXT:    psllw $15, %xmm0
2608; SSSE3-NEXT:    psraw $15, %xmm0
2609; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2610; SSSE3-NEXT:    psllw $15, %xmm1
2611; SSSE3-NEXT:    psraw $15, %xmm1
2612; SSSE3-NEXT:    retq
2613;
2614; SSE41-LABEL: load_sext_16i1_to_16i16:
2615; SSE41:       # BB#0: # %entry
2616; SSE41-NEXT:    movzwl (%rdi), %eax
2617; SSE41-NEXT:    movl %eax, %ecx
2618; SSE41-NEXT:    shrl %ecx
2619; SSE41-NEXT:    andl $1, %ecx
2620; SSE41-NEXT:    movl %eax, %edx
2621; SSE41-NEXT:    andl $1, %edx
2622; SSE41-NEXT:    movd %edx, %xmm1
2623; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
2624; SSE41-NEXT:    movl %eax, %ecx
2625; SSE41-NEXT:    shrl $2, %ecx
2626; SSE41-NEXT:    andl $1, %ecx
2627; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
2628; SSE41-NEXT:    movl %eax, %ecx
2629; SSE41-NEXT:    shrl $3, %ecx
2630; SSE41-NEXT:    andl $1, %ecx
2631; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
2632; SSE41-NEXT:    movl %eax, %ecx
2633; SSE41-NEXT:    shrl $4, %ecx
2634; SSE41-NEXT:    andl $1, %ecx
2635; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
2636; SSE41-NEXT:    movl %eax, %ecx
2637; SSE41-NEXT:    shrl $5, %ecx
2638; SSE41-NEXT:    andl $1, %ecx
2639; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
2640; SSE41-NEXT:    movl %eax, %ecx
2641; SSE41-NEXT:    shrl $6, %ecx
2642; SSE41-NEXT:    andl $1, %ecx
2643; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
2644; SSE41-NEXT:    movl %eax, %ecx
2645; SSE41-NEXT:    shrl $7, %ecx
2646; SSE41-NEXT:    andl $1, %ecx
2647; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
2648; SSE41-NEXT:    movl %eax, %ecx
2649; SSE41-NEXT:    shrl $8, %ecx
2650; SSE41-NEXT:    andl $1, %ecx
2651; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
2652; SSE41-NEXT:    movl %eax, %ecx
2653; SSE41-NEXT:    shrl $9, %ecx
2654; SSE41-NEXT:    andl $1, %ecx
2655; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
2656; SSE41-NEXT:    movl %eax, %ecx
2657; SSE41-NEXT:    shrl $10, %ecx
2658; SSE41-NEXT:    andl $1, %ecx
2659; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
2660; SSE41-NEXT:    movl %eax, %ecx
2661; SSE41-NEXT:    shrl $11, %ecx
2662; SSE41-NEXT:    andl $1, %ecx
2663; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
2664; SSE41-NEXT:    movl %eax, %ecx
2665; SSE41-NEXT:    shrl $12, %ecx
2666; SSE41-NEXT:    andl $1, %ecx
2667; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
2668; SSE41-NEXT:    movl %eax, %ecx
2669; SSE41-NEXT:    shrl $13, %ecx
2670; SSE41-NEXT:    andl $1, %ecx
2671; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
2672; SSE41-NEXT:    movl %eax, %ecx
2673; SSE41-NEXT:    shrl $14, %ecx
2674; SSE41-NEXT:    andl $1, %ecx
2675; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
2676; SSE41-NEXT:    shrl $15, %eax
2677; SSE41-NEXT:    movzwl %ax, %eax
2678; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
2679; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2680; SSE41-NEXT:    psllw $15, %xmm0
2681; SSE41-NEXT:    psraw $15, %xmm0
2682; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2683; SSE41-NEXT:    psllw $15, %xmm1
2684; SSE41-NEXT:    psraw $15, %xmm1
2685; SSE41-NEXT:    retq
2686;
; AVX1 (and AVX2 below): the mask is loaded once with movswq, copied into one
; GPR per lane, and each copy is shifted so the low 16 bits are all-zeros or
; all-ones before vpinsrw. The copies occupy %rbx, %rbp and %r12-%r15 as well,
; so those six registers are pushed on entry and popped before retq.
2687; AVX1-LABEL: load_sext_16i1_to_16i16:
2688; AVX1:       # BB#0: # %entry
2689; AVX1-NEXT:    pushq %rbp
2690; AVX1-NEXT:  .Ltmp0:
2691; AVX1-NEXT:    .cfi_def_cfa_offset 16
2692; AVX1-NEXT:    pushq %r15
2693; AVX1-NEXT:  .Ltmp1:
2694; AVX1-NEXT:    .cfi_def_cfa_offset 24
2695; AVX1-NEXT:    pushq %r14
2696; AVX1-NEXT:  .Ltmp2:
2697; AVX1-NEXT:    .cfi_def_cfa_offset 32
2698; AVX1-NEXT:    pushq %r13
2699; AVX1-NEXT:  .Ltmp3:
2700; AVX1-NEXT:    .cfi_def_cfa_offset 40
2701; AVX1-NEXT:    pushq %r12
2702; AVX1-NEXT:  .Ltmp4:
2703; AVX1-NEXT:    .cfi_def_cfa_offset 48
2704; AVX1-NEXT:    pushq %rbx
2705; AVX1-NEXT:  .Ltmp5:
2706; AVX1-NEXT:    .cfi_def_cfa_offset 56
2707; AVX1-NEXT:  .Ltmp6:
2708; AVX1-NEXT:    .cfi_offset %rbx, -56
2709; AVX1-NEXT:  .Ltmp7:
2710; AVX1-NEXT:    .cfi_offset %r12, -48
2711; AVX1-NEXT:  .Ltmp8:
2712; AVX1-NEXT:    .cfi_offset %r13, -40
2713; AVX1-NEXT:  .Ltmp9:
2714; AVX1-NEXT:    .cfi_offset %r14, -32
2715; AVX1-NEXT:  .Ltmp10:
2716; AVX1-NEXT:    .cfi_offset %r15, -24
2717; AVX1-NEXT:  .Ltmp11:
2718; AVX1-NEXT:    .cfi_offset %rbp, -16
2719; AVX1-NEXT:    movswq (%rdi), %rax
2720; AVX1-NEXT:    movq %rax, %rcx
2721; AVX1-NEXT:    shlq $55, %rcx
2722; AVX1-NEXT:    sarq $63, %rcx
2723; AVX1-NEXT:    vmovd %ecx, %xmm0
2724; AVX1-NEXT:    movq %rax, %r8
2725; AVX1-NEXT:    movq %rax, %r10
2726; AVX1-NEXT:    movq %rax, %r11
2727; AVX1-NEXT:    movq %rax, %r14
2728; AVX1-NEXT:    movq %rax, %r15
2729; AVX1-NEXT:    movq %rax, %r9
2730; AVX1-NEXT:    movq %rax, %r12
2731; AVX1-NEXT:    movq %rax, %r13
2732; AVX1-NEXT:    movq %rax, %rbx
2733; AVX1-NEXT:    movq %rax, %rdi
2734; AVX1-NEXT:    movq %rax, %rcx
2735; AVX1-NEXT:    movq %rax, %rdx
2736; AVX1-NEXT:    movq %rax, %rsi
2737; AVX1-NEXT:    movsbq %al, %rbp
2738; AVX1-NEXT:    shlq $54, %rax
2739; AVX1-NEXT:    sarq $63, %rax
2740; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2741; AVX1-NEXT:    shlq $53, %r8
2742; AVX1-NEXT:    sarq $63, %r8
2743; AVX1-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
2744; AVX1-NEXT:    shlq $52, %r10
2745; AVX1-NEXT:    sarq $63, %r10
2746; AVX1-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
2747; AVX1-NEXT:    shlq $51, %r11
2748; AVX1-NEXT:    sarq $63, %r11
2749; AVX1-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
2750; AVX1-NEXT:    shlq $50, %r14
2751; AVX1-NEXT:    sarq $63, %r14
2752; AVX1-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
2753; AVX1-NEXT:    shlq $49, %r15
2754; AVX1-NEXT:    sarq $63, %r15
2755; AVX1-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
2756; AVX1-NEXT:    shrq $15, %r9
2757; AVX1-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
2758; AVX1-NEXT:    shlq $63, %r13
2759; AVX1-NEXT:    sarq $63, %r13
2760; AVX1-NEXT:    vmovd %r13d, %xmm1
2761; AVX1-NEXT:    shlq $62, %r12
2762; AVX1-NEXT:    sarq $63, %r12
2763; AVX1-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
2764; AVX1-NEXT:    shlq $61, %rbx
2765; AVX1-NEXT:    sarq $63, %rbx
2766; AVX1-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
2767; AVX1-NEXT:    shlq $60, %rdi
2768; AVX1-NEXT:    sarq $63, %rdi
2769; AVX1-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
2770; AVX1-NEXT:    shlq $59, %rcx
2771; AVX1-NEXT:    sarq $63, %rcx
2772; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
2773; AVX1-NEXT:    shlq $58, %rdx
2774; AVX1-NEXT:    sarq $63, %rdx
2775; AVX1-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
2776; AVX1-NEXT:    shlq $57, %rsi
2777; AVX1-NEXT:    sarq $63, %rsi
2778; AVX1-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
2779; AVX1-NEXT:    shrq $7, %rbp
2780; AVX1-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
2781; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2782; AVX1-NEXT:    popq %rbx
2783; AVX1-NEXT:    popq %r12
2784; AVX1-NEXT:    popq %r13
2785; AVX1-NEXT:    popq %r14
2786; AVX1-NEXT:    popq %r15
2787; AVX1-NEXT:    popq %rbp
2788; AVX1-NEXT:    retq
2789;
2790; AVX2-LABEL: load_sext_16i1_to_16i16:
2791; AVX2:       # BB#0: # %entry
2792; AVX2-NEXT:    pushq %rbp
2793; AVX2-NEXT:  .Ltmp0:
2794; AVX2-NEXT:    .cfi_def_cfa_offset 16
2795; AVX2-NEXT:    pushq %r15
2796; AVX2-NEXT:  .Ltmp1:
2797; AVX2-NEXT:    .cfi_def_cfa_offset 24
2798; AVX2-NEXT:    pushq %r14
2799; AVX2-NEXT:  .Ltmp2:
2800; AVX2-NEXT:    .cfi_def_cfa_offset 32
2801; AVX2-NEXT:    pushq %r13
2802; AVX2-NEXT:  .Ltmp3:
2803; AVX2-NEXT:    .cfi_def_cfa_offset 40
2804; AVX2-NEXT:    pushq %r12
2805; AVX2-NEXT:  .Ltmp4:
2806; AVX2-NEXT:    .cfi_def_cfa_offset 48
2807; AVX2-NEXT:    pushq %rbx
2808; AVX2-NEXT:  .Ltmp5:
2809; AVX2-NEXT:    .cfi_def_cfa_offset 56
2810; AVX2-NEXT:  .Ltmp6:
2811; AVX2-NEXT:    .cfi_offset %rbx, -56
2812; AVX2-NEXT:  .Ltmp7:
2813; AVX2-NEXT:    .cfi_offset %r12, -48
2814; AVX2-NEXT:  .Ltmp8:
2815; AVX2-NEXT:    .cfi_offset %r13, -40
2816; AVX2-NEXT:  .Ltmp9:
2817; AVX2-NEXT:    .cfi_offset %r14, -32
2818; AVX2-NEXT:  .Ltmp10:
2819; AVX2-NEXT:    .cfi_offset %r15, -24
2820; AVX2-NEXT:  .Ltmp11:
2821; AVX2-NEXT:    .cfi_offset %rbp, -16
2822; AVX2-NEXT:    movswq (%rdi), %rax
2823; AVX2-NEXT:    movq %rax, %rcx
2824; AVX2-NEXT:    shlq $55, %rcx
2825; AVX2-NEXT:    sarq $63, %rcx
2826; AVX2-NEXT:    vmovd %ecx, %xmm0
2827; AVX2-NEXT:    movq %rax, %r8
2828; AVX2-NEXT:    movq %rax, %r10
2829; AVX2-NEXT:    movq %rax, %r11
2830; AVX2-NEXT:    movq %rax, %r14
2831; AVX2-NEXT:    movq %rax, %r15
2832; AVX2-NEXT:    movq %rax, %r9
2833; AVX2-NEXT:    movq %rax, %r12
2834; AVX2-NEXT:    movq %rax, %r13
2835; AVX2-NEXT:    movq %rax, %rbx
2836; AVX2-NEXT:    movq %rax, %rdi
2837; AVX2-NEXT:    movq %rax, %rcx
2838; AVX2-NEXT:    movq %rax, %rdx
2839; AVX2-NEXT:    movq %rax, %rsi
2840; AVX2-NEXT:    movsbq %al, %rbp
2841; AVX2-NEXT:    shlq $54, %rax
2842; AVX2-NEXT:    sarq $63, %rax
2843; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2844; AVX2-NEXT:    shlq $53, %r8
2845; AVX2-NEXT:    sarq $63, %r8
2846; AVX2-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
2847; AVX2-NEXT:    shlq $52, %r10
2848; AVX2-NEXT:    sarq $63, %r10
2849; AVX2-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
2850; AVX2-NEXT:    shlq $51, %r11
2851; AVX2-NEXT:    sarq $63, %r11
2852; AVX2-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
2853; AVX2-NEXT:    shlq $50, %r14
2854; AVX2-NEXT:    sarq $63, %r14
2855; AVX2-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
2856; AVX2-NEXT:    shlq $49, %r15
2857; AVX2-NEXT:    sarq $63, %r15
2858; AVX2-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
2859; AVX2-NEXT:    shrq $15, %r9
2860; AVX2-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
2861; AVX2-NEXT:    shlq $63, %r13
2862; AVX2-NEXT:    sarq $63, %r13
2863; AVX2-NEXT:    vmovd %r13d, %xmm1
2864; AVX2-NEXT:    shlq $62, %r12
2865; AVX2-NEXT:    sarq $63, %r12
2866; AVX2-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
2867; AVX2-NEXT:    shlq $61, %rbx
2868; AVX2-NEXT:    sarq $63, %rbx
2869; AVX2-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
2870; AVX2-NEXT:    shlq $60, %rdi
2871; AVX2-NEXT:    sarq $63, %rdi
2872; AVX2-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
2873; AVX2-NEXT:    shlq $59, %rcx
2874; AVX2-NEXT:    sarq $63, %rcx
2875; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
2876; AVX2-NEXT:    shlq $58, %rdx
2877; AVX2-NEXT:    sarq $63, %rdx
2878; AVX2-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
2879; AVX2-NEXT:    shlq $57, %rsi
2880; AVX2-NEXT:    sarq $63, %rsi
2881; AVX2-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
2882; AVX2-NEXT:    shrq $7, %rbp
2883; AVX2-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
2884; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
2885; AVX2-NEXT:    popq %rbx
2886; AVX2-NEXT:    popq %r12
2887; AVX2-NEXT:    popq %r13
2888; AVX2-NEXT:    popq %r14
2889; AVX2-NEXT:    popq %r15
2890; AVX2-NEXT:    popq %rbp
2891; AVX2-NEXT:    retq
2892;
; AVX512: kmovw loads the 16 mask bits into %k1, vpternlogd $255 fills %zmm0
; with all-ones, the zero-masked vmovdqa32 keeps the lanes selected by %k1 and
; clears the rest, and vpmovdw narrows the result from %zmm0 into %ymm0.
2893; AVX512-LABEL: load_sext_16i1_to_16i16:
2894; AVX512:       # BB#0: # %entry
2895; AVX512-NEXT:    kmovw (%rdi), %k1
2896; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
2897; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
2898; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2899; AVX512-NEXT:    retq
2900;
; X32-SSE41: the i686 run; the pointer argument is first read from the stack,
; after which the sequence follows the 64-bit SSE41 expectations above.
2901; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
2902; X32-SSE41:       # BB#0: # %entry
2903; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2904; X32-SSE41-NEXT:    movzwl (%eax), %eax
2905; X32-SSE41-NEXT:    movl %eax, %ecx
2906; X32-SSE41-NEXT:    shrl %ecx
2907; X32-SSE41-NEXT:    andl $1, %ecx
2908; X32-SSE41-NEXT:    movl %eax, %edx
2909; X32-SSE41-NEXT:    andl $1, %edx
2910; X32-SSE41-NEXT:    movd %edx, %xmm1
2911; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
2912; X32-SSE41-NEXT:    movl %eax, %ecx
2913; X32-SSE41-NEXT:    shrl $2, %ecx
2914; X32-SSE41-NEXT:    andl $1, %ecx
2915; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
2916; X32-SSE41-NEXT:    movl %eax, %ecx
2917; X32-SSE41-NEXT:    shrl $3, %ecx
2918; X32-SSE41-NEXT:    andl $1, %ecx
2919; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
2920; X32-SSE41-NEXT:    movl %eax, %ecx
2921; X32-SSE41-NEXT:    shrl $4, %ecx
2922; X32-SSE41-NEXT:    andl $1, %ecx
2923; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
2924; X32-SSE41-NEXT:    movl %eax, %ecx
2925; X32-SSE41-NEXT:    shrl $5, %ecx
2926; X32-SSE41-NEXT:    andl $1, %ecx
2927; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
2928; X32-SSE41-NEXT:    movl %eax, %ecx
2929; X32-SSE41-NEXT:    shrl $6, %ecx
2930; X32-SSE41-NEXT:    andl $1, %ecx
2931; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
2932; X32-SSE41-NEXT:    movl %eax, %ecx
2933; X32-SSE41-NEXT:    shrl $7, %ecx
2934; X32-SSE41-NEXT:    andl $1, %ecx
2935; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
2936; X32-SSE41-NEXT:    movl %eax, %ecx
2937; X32-SSE41-NEXT:    shrl $8, %ecx
2938; X32-SSE41-NEXT:    andl $1, %ecx
2939; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
2940; X32-SSE41-NEXT:    movl %eax, %ecx
2941; X32-SSE41-NEXT:    shrl $9, %ecx
2942; X32-SSE41-NEXT:    andl $1, %ecx
2943; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
2944; X32-SSE41-NEXT:    movl %eax, %ecx
2945; X32-SSE41-NEXT:    shrl $10, %ecx
2946; X32-SSE41-NEXT:    andl $1, %ecx
2947; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
2948; X32-SSE41-NEXT:    movl %eax, %ecx
2949; X32-SSE41-NEXT:    shrl $11, %ecx
2950; X32-SSE41-NEXT:    andl $1, %ecx
2951; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
2952; X32-SSE41-NEXT:    movl %eax, %ecx
2953; X32-SSE41-NEXT:    shrl $12, %ecx
2954; X32-SSE41-NEXT:    andl $1, %ecx
2955; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
2956; X32-SSE41-NEXT:    movl %eax, %ecx
2957; X32-SSE41-NEXT:    shrl $13, %ecx
2958; X32-SSE41-NEXT:    andl $1, %ecx
2959; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
2960; X32-SSE41-NEXT:    movl %eax, %ecx
2961; X32-SSE41-NEXT:    shrl $14, %ecx
2962; X32-SSE41-NEXT:    andl $1, %ecx
2963; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
2964; X32-SSE41-NEXT:    shrl $15, %eax
2965; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
2966; X32-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2967; X32-SSE41-NEXT:    psllw $15, %xmm0
2968; X32-SSE41-NEXT:    psraw $15, %xmm0
2969; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2970; X32-SSE41-NEXT:    psllw $15, %xmm1
2971; X32-SSE41-NEXT:    psraw $15, %xmm1
2972; X32-SSE41-NEXT:    retl
; IR under test: load the i1 vector, sign-extend every lane to i16, return it.
2973entry:
2974 %X = load <16 x i1>, <16 x i1>* %ptr
2975 %Y = sext <16 x i1> %X to <16 x i16>
2976 ret <16 x i16> %Y
2977}
2978
2979define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
2980; SSE2-LABEL: load_sext_32i1_to_32i8:
2981; SSE2:       # BB#0: # %entry
2982; SSE2-NEXT:    pushq %rbp
2983; SSE2-NEXT:    pushq %r15
2984; SSE2-NEXT:    pushq %r14
2985; SSE2-NEXT:    pushq %r13
2986; SSE2-NEXT:    pushq %r12
2987; SSE2-NEXT:    pushq %rbx
2988; SSE2-NEXT:    movswq (%rdi), %rbx
2989; SSE2-NEXT:    movq %rbx, %r10
2990; SSE2-NEXT:    movq %rbx, %r8
2991; SSE2-NEXT:    movq %rbx, %r9
2992; SSE2-NEXT:    movq %rbx, %r11
2993; SSE2-NEXT:    movq %rbx, %r14
2994; SSE2-NEXT:    movq %rbx, %r15
2995; SSE2-NEXT:    movq %rbx, %r12
2996; SSE2-NEXT:    movq %rbx, %r13
2997; SSE2-NEXT:    movq %rbx, %rdx
2998; SSE2-NEXT:    movq %rbx, %rsi
2999; SSE2-NEXT:    movq %rbx, %rcx
3000; SSE2-NEXT:    movq %rbx, %rbp
3001; SSE2-NEXT:    movq %rbx, %rax
3002; SSE2-NEXT:    shlq $49, %rax
3003; SSE2-NEXT:    sarq $63, %rax
3004; SSE2-NEXT:    movd %eax, %xmm0
3005; SSE2-NEXT:    movq %rbx, %rax
3006; SSE2-NEXT:    shlq $57, %r10
3007; SSE2-NEXT:    sarq $63, %r10
3008; SSE2-NEXT:    movd %r10d, %xmm15
3009; SSE2-NEXT:    movq %rbx, %r10
3010; SSE2-NEXT:    movsbq %bl, %rbx
3011; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
3012; SSE2-NEXT:    shlq $53, %r8
3013; SSE2-NEXT:    sarq $63, %r8
3014; SSE2-NEXT:    movd %r8d, %xmm8
3015; SSE2-NEXT:    shlq $61, %r9
3016; SSE2-NEXT:    sarq $63, %r9
3017; SSE2-NEXT:    movd %r9d, %xmm2
3018; SSE2-NEXT:    shlq $51, %r11
3019; SSE2-NEXT:    sarq $63, %r11
3020; SSE2-NEXT:    movd %r11d, %xmm9
3021; SSE2-NEXT:    shlq $59, %r14
3022; SSE2-NEXT:    sarq $63, %r14
3023; SSE2-NEXT:    movd %r14d, %xmm5
3024; SSE2-NEXT:    shlq $55, %r15
3025; SSE2-NEXT:    sarq $63, %r15
3026; SSE2-NEXT:    movd %r15d, %xmm10
3027; SSE2-NEXT:    shlq $63, %r12
3028; SSE2-NEXT:    sarq $63, %r12
3029; SSE2-NEXT:    movd %r12d, %xmm0
3030; SSE2-NEXT:    shlq $50, %r13
3031; SSE2-NEXT:    sarq $63, %r13
3032; SSE2-NEXT:    movd %r13d, %xmm11
3033; SSE2-NEXT:    shlq $58, %rdx
3034; SSE2-NEXT:    sarq $63, %rdx
3035; SSE2-NEXT:    movd %edx, %xmm4
3036; SSE2-NEXT:    shlq $54, %rsi
3037; SSE2-NEXT:    sarq $63, %rsi
3038; SSE2-NEXT:    movd %esi, %xmm12
3039; SSE2-NEXT:    shlq $62, %rcx
3040; SSE2-NEXT:    sarq $63, %rcx
3041; SSE2-NEXT:    movd %ecx, %xmm6
3042; SSE2-NEXT:    shlq $52, %rbp
3043; SSE2-NEXT:    sarq $63, %rbp
3044; SSE2-NEXT:    movd %ebp, %xmm13
3045; SSE2-NEXT:    shlq $60, %rax
3046; SSE2-NEXT:    sarq $63, %rax
3047; SSE2-NEXT:    movd %eax, %xmm7
3048; SSE2-NEXT:    shrq $15, %r10
3049; SSE2-NEXT:    movd %r10d, %xmm14
3050; SSE2-NEXT:    shrq $7, %rbx
3051; SSE2-NEXT:    movd %ebx, %xmm3
3052; SSE2-NEXT:    movswq 2(%rdi), %rdx
3053; SSE2-NEXT:    movq %rdx, %r8
3054; SSE2-NEXT:    movq %rdx, %r9
3055; SSE2-NEXT:    movq %rdx, %r10
3056; SSE2-NEXT:    movq %rdx, %r11
3057; SSE2-NEXT:    movq %rdx, %r14
3058; SSE2-NEXT:    movq %rdx, %r15
3059; SSE2-NEXT:    movq %rdx, %r12
3060; SSE2-NEXT:    movq %rdx, %r13
3061; SSE2-NEXT:    movq %rdx, %rbx
3062; SSE2-NEXT:    movq %rdx, %rax
3063; SSE2-NEXT:    movq %rdx, %rcx
3064; SSE2-NEXT:    movq %rdx, %rsi
3065; SSE2-NEXT:    movq %rdx, %rdi
3066; SSE2-NEXT:    movq %rdx, %rbp
3067; SSE2-NEXT:    shlq $49, %rbp
3068; SSE2-NEXT:    sarq $63, %rbp
3069; SSE2-NEXT:    movd %ebp, %xmm1
3070; SSE2-NEXT:    movq %rdx, %rbp
3071; SSE2-NEXT:    movsbq %dl, %rdx
3072; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
3073; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
3074; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
3075; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
3076; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
3077; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3078; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
3079; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
3080; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
3081; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
3082; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
3083; SSE2-NEXT:    shlq $57, %r8
3084; SSE2-NEXT:    sarq $63, %r8
3085; SSE2-NEXT:    movd %r8d, %xmm2
3086; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
3087; SSE2-NEXT:    shlq $53, %r9
3088; SSE2-NEXT:    sarq $63, %r9
3089; SSE2-NEXT:    movd %r9d, %xmm3
3090; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3091; SSE2-NEXT:    shlq $61, %r10
3092; SSE2-NEXT:    sarq $63, %r10
3093; SSE2-NEXT:    movd %r10d, %xmm4
3094; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3095; SSE2-NEXT:    shlq $51, %r11
3096; SSE2-NEXT:    sarq $63, %r11
3097; SSE2-NEXT:    movd %r11d, %xmm5
3098; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3099; SSE2-NEXT:    shlq $59, %r14
3100; SSE2-NEXT:    sarq $63, %r14
3101; SSE2-NEXT:    movd %r14d, %xmm6
3102; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3103; SSE2-NEXT:    shlq $55, %r15
3104; SSE2-NEXT:    sarq $63, %r15
3105; SSE2-NEXT:    movd %r15d, %xmm3
3106; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3107; SSE2-NEXT:    shlq $63, %r12
3108; SSE2-NEXT:    sarq $63, %r12
3109; SSE2-NEXT:    movd %r12d, %xmm1
3110; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3111; SSE2-NEXT:    shlq $50, %r13
3112; SSE2-NEXT:    sarq $63, %r13
3113; SSE2-NEXT:    movd %r13d, %xmm2
3114; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3115; SSE2-NEXT:    shlq $58, %rbx
3116; SSE2-NEXT:    sarq $63, %rbx
3117; SSE2-NEXT:    movd %ebx, %xmm3
3118; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
3119; SSE2-NEXT:    shlq $54, %rax
3120; SSE2-NEXT:    sarq $63, %rax
3121; SSE2-NEXT:    movd %eax, %xmm5
3122; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3123; SSE2-NEXT:    shlq $62, %rcx
3124; SSE2-NEXT:    sarq $63, %rcx
3125; SSE2-NEXT:    movd %ecx, %xmm4
3126; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3127; SSE2-NEXT:    shlq $52, %rsi
3128; SSE2-NEXT:    sarq $63, %rsi
3129; SSE2-NEXT:    movd %esi, %xmm2
3130; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3131; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3132; SSE2-NEXT:    shlq $60, %rdi
3133; SSE2-NEXT:    sarq $63, %rdi
3134; SSE2-NEXT:    movd %edi, %xmm3
3135; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3136; SSE2-NEXT:    shrq $15, %rbp
3137; SSE2-NEXT:    movd %ebp, %xmm2
3138; SSE2-NEXT:    shrq $7, %rdx
3139; SSE2-NEXT:    movd %edx, %xmm5
3140; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
3141; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
3142; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3143; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3144; SSE2-NEXT:    popq %rbx
3145; SSE2-NEXT:    popq %r12
3146; SSE2-NEXT:    popq %r13
3147; SSE2-NEXT:    popq %r14
3148; SSE2-NEXT:    popq %r15
3149; SSE2-NEXT:    popq %rbp
3150; SSE2-NEXT:    retq
3151;
3152; SSSE3-LABEL: load_sext_32i1_to_32i8:
3153; SSSE3:       # BB#0: # %entry
3154; SSSE3-NEXT:    pushq %rbp
3155; SSSE3-NEXT:    pushq %r15
3156; SSSE3-NEXT:    pushq %r14
3157; SSSE3-NEXT:    pushq %r13
3158; SSSE3-NEXT:    pushq %r12
3159; SSSE3-NEXT:    pushq %rbx
3160; SSSE3-NEXT:    movswq (%rdi), %rbx
3161; SSSE3-NEXT:    movq %rbx, %r10
3162; SSSE3-NEXT:    movq %rbx, %r8
3163; SSSE3-NEXT:    movq %rbx, %r9
3164; SSSE3-NEXT:    movq %rbx, %r11
3165; SSSE3-NEXT:    movq %rbx, %r14
3166; SSSE3-NEXT:    movq %rbx, %r15
3167; SSSE3-NEXT:    movq %rbx, %r12
3168; SSSE3-NEXT:    movq %rbx, %r13
3169; SSSE3-NEXT:    movq %rbx, %rdx
3170; SSSE3-NEXT:    movq %rbx, %rsi
3171; SSSE3-NEXT:    movq %rbx, %rcx
3172; SSSE3-NEXT:    movq %rbx, %rbp
3173; SSSE3-NEXT:    movq %rbx, %rax
3174; SSSE3-NEXT:    shlq $49, %rax
3175; SSSE3-NEXT:    sarq $63, %rax
3176; SSSE3-NEXT:    movd %eax, %xmm0
3177; SSSE3-NEXT:    movq %rbx, %rax
3178; SSSE3-NEXT:    shlq $57, %r10
3179; SSSE3-NEXT:    sarq $63, %r10
3180; SSSE3-NEXT:    movd %r10d, %xmm15
3181; SSSE3-NEXT:    movq %rbx, %r10
3182; SSSE3-NEXT:    movsbq %bl, %rbx
3183; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
3184; SSSE3-NEXT:    shlq $53, %r8
3185; SSSE3-NEXT:    sarq $63, %r8
3186; SSSE3-NEXT:    movd %r8d, %xmm8
3187; SSSE3-NEXT:    shlq $61, %r9
3188; SSSE3-NEXT:    sarq $63, %r9
3189; SSSE3-NEXT:    movd %r9d, %xmm2
3190; SSSE3-NEXT:    shlq $51, %r11
3191; SSSE3-NEXT:    sarq $63, %r11
3192; SSSE3-NEXT:    movd %r11d, %xmm9
3193; SSSE3-NEXT:    shlq $59, %r14
3194; SSSE3-NEXT:    sarq $63, %r14
3195; SSSE3-NEXT:    movd %r14d, %xmm5
3196; SSSE3-NEXT:    shlq $55, %r15
3197; SSSE3-NEXT:    sarq $63, %r15
3198; SSSE3-NEXT:    movd %r15d, %xmm10
3199; SSSE3-NEXT:    shlq $63, %r12
3200; SSSE3-NEXT:    sarq $63, %r12
3201; SSSE3-NEXT:    movd %r12d, %xmm0
3202; SSSE3-NEXT:    shlq $50, %r13
3203; SSSE3-NEXT:    sarq $63, %r13
3204; SSSE3-NEXT:    movd %r13d, %xmm11
3205; SSSE3-NEXT:    shlq $58, %rdx
3206; SSSE3-NEXT:    sarq $63, %rdx
3207; SSSE3-NEXT:    movd %edx, %xmm4
3208; SSSE3-NEXT:    shlq $54, %rsi
3209; SSSE3-NEXT:    sarq $63, %rsi
3210; SSSE3-NEXT:    movd %esi, %xmm12
3211; SSSE3-NEXT:    shlq $62, %rcx
3212; SSSE3-NEXT:    sarq $63, %rcx
3213; SSSE3-NEXT:    movd %ecx, %xmm6
3214; SSSE3-NEXT:    shlq $52, %rbp
3215; SSSE3-NEXT:    sarq $63, %rbp
3216; SSSE3-NEXT:    movd %ebp, %xmm13
3217; SSSE3-NEXT:    shlq $60, %rax
3218; SSSE3-NEXT:    sarq $63, %rax
3219; SSSE3-NEXT:    movd %eax, %xmm7
3220; SSSE3-NEXT:    shrq $15, %r10
3221; SSSE3-NEXT:    movd %r10d, %xmm14
3222; SSSE3-NEXT:    shrq $7, %rbx
3223; SSSE3-NEXT:    movd %ebx, %xmm3
3224; SSSE3-NEXT:    movswq 2(%rdi), %rdx
3225; SSSE3-NEXT:    movq %rdx, %r8
3226; SSSE3-NEXT:    movq %rdx, %r9
3227; SSSE3-NEXT:    movq %rdx, %r10
3228; SSSE3-NEXT:    movq %rdx, %r11
3229; SSSE3-NEXT:    movq %rdx, %r14
3230; SSSE3-NEXT:    movq %rdx, %r15
3231; SSSE3-NEXT:    movq %rdx, %r12
3232; SSSE3-NEXT:    movq %rdx, %r13
3233; SSSE3-NEXT:    movq %rdx, %rbx
3234; SSSE3-NEXT:    movq %rdx, %rax
3235; SSSE3-NEXT:    movq %rdx, %rcx
3236; SSSE3-NEXT:    movq %rdx, %rsi
3237; SSSE3-NEXT:    movq %rdx, %rdi
3238; SSSE3-NEXT:    movq %rdx, %rbp
3239; SSSE3-NEXT:    shlq $49, %rbp
3240; SSSE3-NEXT:    sarq $63, %rbp
3241; SSSE3-NEXT:    movd %ebp, %xmm1
3242; SSSE3-NEXT:    movq %rdx, %rbp
3243; SSSE3-NEXT:    movsbq %dl, %rdx
3244; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
3245; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
3246; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
3247; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
3248; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
3249; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3250; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
3251; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
3252; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
3253; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
3254; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
3255; SSSE3-NEXT:    shlq $57, %r8
3256; SSSE3-NEXT:    sarq $63, %r8
3257; SSSE3-NEXT:    movd %r8d, %xmm2
3258; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
3259; SSSE3-NEXT:    shlq $53, %r9
3260; SSSE3-NEXT:    sarq $63, %r9
3261; SSSE3-NEXT:    movd %r9d, %xmm3
3262; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3263; SSSE3-NEXT:    shlq $61, %r10
3264; SSSE3-NEXT:    sarq $63, %r10
3265; SSSE3-NEXT:    movd %r10d, %xmm4
3266; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3267; SSSE3-NEXT:    shlq $51, %r11
3268; SSSE3-NEXT:    sarq $63, %r11
3269; SSSE3-NEXT:    movd %r11d, %xmm5
3270; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3271; SSSE3-NEXT:    shlq $59, %r14
3272; SSSE3-NEXT:    sarq $63, %r14
3273; SSSE3-NEXT:    movd %r14d, %xmm6
3274; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3275; SSSE3-NEXT:    shlq $55, %r15
3276; SSSE3-NEXT:    sarq $63, %r15
3277; SSSE3-NEXT:    movd %r15d, %xmm3
3278; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3279; SSSE3-NEXT:    shlq $63, %r12
3280; SSSE3-NEXT:    sarq $63, %r12
3281; SSSE3-NEXT:    movd %r12d, %xmm1
3282; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3283; SSSE3-NEXT:    shlq $50, %r13
3284; SSSE3-NEXT:    sarq $63, %r13
3285; SSSE3-NEXT:    movd %r13d, %xmm2
3286; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3287; SSSE3-NEXT:    shlq $58, %rbx
3288; SSSE3-NEXT:    sarq $63, %rbx
3289; SSSE3-NEXT:    movd %ebx, %xmm3
3290; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
3291; SSSE3-NEXT:    shlq $54, %rax
3292; SSSE3-NEXT:    sarq $63, %rax
3293; SSSE3-NEXT:    movd %eax, %xmm5
3294; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3295; SSSE3-NEXT:    shlq $62, %rcx
3296; SSSE3-NEXT:    sarq $63, %rcx
3297; SSSE3-NEXT:    movd %ecx, %xmm4
3298; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3299; SSSE3-NEXT:    shlq $52, %rsi
3300; SSSE3-NEXT:    sarq $63, %rsi
3301; SSSE3-NEXT:    movd %esi, %xmm2
3302; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3303; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3304; SSSE3-NEXT:    shlq $60, %rdi
3305; SSSE3-NEXT:    sarq $63, %rdi
3306; SSSE3-NEXT:    movd %edi, %xmm3
3307; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3308; SSSE3-NEXT:    shrq $15, %rbp
3309; SSSE3-NEXT:    movd %ebp, %xmm2
3310; SSSE3-NEXT:    shrq $7, %rdx
3311; SSSE3-NEXT:    movd %edx, %xmm5
3312; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
3313; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
3314; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3315; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3316; SSSE3-NEXT:    popq %rbx
3317; SSSE3-NEXT:    popq %r12
3318; SSSE3-NEXT:    popq %r13
3319; SSSE3-NEXT:    popq %r14
3320; SSSE3-NEXT:    popq %r15
3321; SSSE3-NEXT:    popq %rbp
3322; SSSE3-NEXT:    retq
3323;
3324; SSE41-LABEL: load_sext_32i1_to_32i8:
3325; SSE41:       # BB#0: # %entry
3326; SSE41-NEXT:    movswq (%rdi), %rax
3327; SSE41-NEXT:    movq %rax, %rcx
3328; SSE41-NEXT:    shlq $62, %rcx
3329; SSE41-NEXT:    sarq $63, %rcx
3330; SSE41-NEXT:    movq %rax, %rdx
3331; SSE41-NEXT:    shlq $63, %rdx
3332; SSE41-NEXT:    sarq $63, %rdx
3333; SSE41-NEXT:    movd %edx, %xmm0
3334; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
3335; SSE41-NEXT:    movq %rax, %rcx
3336; SSE41-NEXT:    shlq $61, %rcx
3337; SSE41-NEXT:    sarq $63, %rcx
3338; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
3339; SSE41-NEXT:    movq %rax, %rcx
3340; SSE41-NEXT:    shlq $60, %rcx
3341; SSE41-NEXT:    sarq $63, %rcx
3342; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
3343; SSE41-NEXT:    movq %rax, %rcx
3344; SSE41-NEXT:    shlq $59, %rcx
3345; SSE41-NEXT:    sarq $63, %rcx
3346; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
3347; SSE41-NEXT:    movq %rax, %rcx
3348; SSE41-NEXT:    shlq $58, %rcx
3349; SSE41-NEXT:    sarq $63, %rcx
3350; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
3351; SSE41-NEXT:    movq %rax, %rcx
3352; SSE41-NEXT:    shlq $57, %rcx
3353; SSE41-NEXT:    sarq $63, %rcx
3354; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
3355; SSE41-NEXT:    movsbq %al, %rcx
3356; SSE41-NEXT:    shrq $7, %rcx
3357; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
3358; SSE41-NEXT:    movq %rax, %rcx
3359; SSE41-NEXT:    shlq $55, %rcx
3360; SSE41-NEXT:    sarq $63, %rcx
3361; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
3362; SSE41-NEXT:    movq %rax, %rcx
3363; SSE41-NEXT:    shlq $54, %rcx
3364; SSE41-NEXT:    sarq $63, %rcx
3365; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
3366; SSE41-NEXT:    movq %rax, %rcx
3367; SSE41-NEXT:    shlq $53, %rcx
3368; SSE41-NEXT:    sarq $63, %rcx
3369; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
3370; SSE41-NEXT:    movq %rax, %rcx
3371; SSE41-NEXT:    shlq $52, %rcx
3372; SSE41-NEXT:    sarq $63, %rcx
3373; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
3374; SSE41-NEXT:    movq %rax, %rcx
3375; SSE41-NEXT:    shlq $51, %rcx
3376; SSE41-NEXT:    sarq $63, %rcx
3377; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
3378; SSE41-NEXT:    movq %rax, %rcx
3379; SSE41-NEXT:    shlq $50, %rcx
3380; SSE41-NEXT:    sarq $63, %rcx
3381; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
3382; SSE41-NEXT:    movq %rax, %rcx
3383; SSE41-NEXT:    shlq $49, %rcx
3384; SSE41-NEXT:    sarq $63, %rcx
3385; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
3386; SSE41-NEXT:    shrq $15, %rax
3387; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
3388; SSE41-NEXT:    movswq 2(%rdi), %rax
3389; SSE41-NEXT:    movq %rax, %rcx
3390; SSE41-NEXT:    shlq $62, %rcx
3391; SSE41-NEXT:    sarq $63, %rcx
3392; SSE41-NEXT:    movq %rax, %rdx
3393; SSE41-NEXT:    shlq $63, %rdx
3394; SSE41-NEXT:    sarq $63, %rdx
3395; SSE41-NEXT:    movd %edx, %xmm1
3396; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3397; SSE41-NEXT:    movq %rax, %rcx
3398; SSE41-NEXT:    shlq $61, %rcx
3399; SSE41-NEXT:    sarq $63, %rcx
3400; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3401; SSE41-NEXT:    movq %rax, %rcx
3402; SSE41-NEXT:    shlq $60, %rcx
3403; SSE41-NEXT:    sarq $63, %rcx
3404; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3405; SSE41-NEXT:    movq %rax, %rcx
3406; SSE41-NEXT:    shlq $59, %rcx
3407; SSE41-NEXT:    sarq $63, %rcx
3408; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3409; SSE41-NEXT:    movq %rax, %rcx
3410; SSE41-NEXT:    shlq $58, %rcx
3411; SSE41-NEXT:    sarq $63, %rcx
3412; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3413; SSE41-NEXT:    movq %rax, %rcx
3414; SSE41-NEXT:    shlq $57, %rcx
3415; SSE41-NEXT:    sarq $63, %rcx
3416; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3417; SSE41-NEXT:    movsbq %al, %rcx
3418; SSE41-NEXT:    shrq $7, %rcx
3419; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3420; SSE41-NEXT:    movq %rax, %rcx
3421; SSE41-NEXT:    shlq $55, %rcx
3422; SSE41-NEXT:    sarq $63, %rcx
3423; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3424; SSE41-NEXT:    movq %rax, %rcx
3425; SSE41-NEXT:    shlq $54, %rcx
3426; SSE41-NEXT:    sarq $63, %rcx
3427; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3428; SSE41-NEXT:    movq %rax, %rcx
3429; SSE41-NEXT:    shlq $53, %rcx
3430; SSE41-NEXT:    sarq $63, %rcx
3431; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3432; SSE41-NEXT:    movq %rax, %rcx
3433; SSE41-NEXT:    shlq $52, %rcx
3434; SSE41-NEXT:    sarq $63, %rcx
3435; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3436; SSE41-NEXT:    movq %rax, %rcx
3437; SSE41-NEXT:    shlq $51, %rcx
3438; SSE41-NEXT:    sarq $63, %rcx
3439; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3440; SSE41-NEXT:    movq %rax, %rcx
3441; SSE41-NEXT:    shlq $50, %rcx
3442; SSE41-NEXT:    sarq $63, %rcx
3443; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3444; SSE41-NEXT:    movq %rax, %rcx
3445; SSE41-NEXT:    shlq $49, %rcx
3446; SSE41-NEXT:    sarq $63, %rcx
3447; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3448; SSE41-NEXT:    shrq $15, %rax
3449; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3450; SSE41-NEXT:    retq
3451;
3452; AVX1-LABEL: load_sext_32i1_to_32i8:
3453; AVX1:       # BB#0: # %entry
3454; AVX1-NEXT:    pushq %rbp
3455; AVX1-NEXT:    pushq %r15
3456; AVX1-NEXT:    pushq %r14
3457; AVX1-NEXT:    pushq %r13
3458; AVX1-NEXT:    pushq %r12
3459; AVX1-NEXT:    pushq %rbx
3460; AVX1-NEXT:    movslq (%rdi), %rax
3461; AVX1-NEXT:    movq %rax, %rcx
3462; AVX1-NEXT:    shlq $47, %rcx
3463; AVX1-NEXT:    sarq $63, %rcx
3464; AVX1-NEXT:    vmovd %ecx, %xmm0
3465; AVX1-NEXT:    movq %rax, %r8
3466; AVX1-NEXT:    movq %rax, %rdx
3467; AVX1-NEXT:    movq %rax, %rcx
3468; AVX1-NEXT:    movq %rax, %rdi
3469; AVX1-NEXT:    movq %rax, %r13
3470; AVX1-NEXT:    movq %rax, %rsi
3471; AVX1-NEXT:    movq %rax, %r10
3472; AVX1-NEXT:    movq %rax, %r11
3473; AVX1-NEXT:    movq %rax, %r9
3474; AVX1-NEXT:    movq %rax, %rbx
3475; AVX1-NEXT:    movq %rax, %r14
3476; AVX1-NEXT:    movq %rax, %r15
3477; AVX1-NEXT:    movq %rax, %r12
3478; AVX1-NEXT:    movq %rax, %rbp
3479; AVX1-NEXT:    shlq $46, %rbp
3480; AVX1-NEXT:    sarq $63, %rbp
3481; AVX1-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
3482; AVX1-NEXT:    movq %rax, %rbp
3483; AVX1-NEXT:    shlq $45, %r8
3484; AVX1-NEXT:    sarq $63, %r8
3485; AVX1-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
3486; AVX1-NEXT:    movq %rax, %r8
3487; AVX1-NEXT:    shlq $44, %rdx
3488; AVX1-NEXT:    sarq $63, %rdx
3489; AVX1-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
3490; AVX1-NEXT:    movq %rax, %rdx
3491; AVX1-NEXT:    shlq $43, %rcx
3492; AVX1-NEXT:    sarq $63, %rcx
3493; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
3494; AVX1-NEXT:    movq %rax, %rcx
3495; AVX1-NEXT:    shlq $42, %rdi
3496; AVX1-NEXT:    sarq $63, %rdi
3497; AVX1-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
3498; AVX1-NEXT:    movq %rax, %rdi
3499; AVX1-NEXT:    shlq $41, %r13
3500; AVX1-NEXT:    sarq $63, %r13
3501; AVX1-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
3502; AVX1-NEXT:    movq %rax, %r13
3503; AVX1-NEXT:    shlq $40, %rsi
3504; AVX1-NEXT:    sarq $63, %rsi
3505; AVX1-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
3506; AVX1-NEXT:    movq %rax, %rsi
3507; AVX1-NEXT:    shlq $39, %r10
3508; AVX1-NEXT:    sarq $63, %r10
3509; AVX1-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
3510; AVX1-NEXT:    movq %rax, %r10
3511; AVX1-NEXT:    shlq $38, %r11
3512; AVX1-NEXT:    sarq $63, %r11
3513; AVX1-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
3514; AVX1-NEXT:    movsbq %al, %r11
3515; AVX1-NEXT:    shlq $37, %r9
3516; AVX1-NEXT:    sarq $63, %r9
3517; AVX1-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
3518; AVX1-NEXT:    movq %rax, %r9
3519; AVX1-NEXT:    shlq $36, %rbx
3520; AVX1-NEXT:    sarq $63, %rbx
3521; AVX1-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
3522; AVX1-NEXT:    movq %rax, %rbx
3523; AVX1-NEXT:    shlq $35, %r14
3524; AVX1-NEXT:    sarq $63, %r14
3525; AVX1-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
3526; AVX1-NEXT:    movq %rax, %r14
3527; AVX1-NEXT:    shlq $34, %r15
3528; AVX1-NEXT:    sarq $63, %r15
3529; AVX1-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
3530; AVX1-NEXT:    movq %rax, %r15
3531; AVX1-NEXT:    shlq $33, %r12
3532; AVX1-NEXT:    sarq $63, %r12
3533; AVX1-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
3534; AVX1-NEXT:    movq %rax, %r12
3535; AVX1-NEXT:    shrq $31, %rbp
3536; AVX1-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
3537; AVX1-NEXT:    movq %rax, %rbp
3538; AVX1-NEXT:    shlq $63, %rdx
3539; AVX1-NEXT:    sarq $63, %rdx
3540; AVX1-NEXT:    vmovd %edx, %xmm1
3541; AVX1-NEXT:    movq %rax, %rdx
3542; AVX1-NEXT:    movswq %ax, %rax
3543; AVX1-NEXT:    shlq $62, %r8
3544; AVX1-NEXT:    sarq $63, %r8
3545; AVX1-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
3546; AVX1-NEXT:    shlq $61, %rcx
3547; AVX1-NEXT:    sarq $63, %rcx
3548; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
3549; AVX1-NEXT:    shlq $60, %rdi
3550; AVX1-NEXT:    sarq $63, %rdi
3551; AVX1-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
3552; AVX1-NEXT:    shlq $59, %r13
3553; AVX1-NEXT:    sarq $63, %r13
3554; AVX1-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
3555; AVX1-NEXT:    shlq $58, %rsi
3556; AVX1-NEXT:    sarq $63, %rsi
3557; AVX1-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
3558; AVX1-NEXT:    shlq $57, %r10
3559; AVX1-NEXT:    sarq $63, %r10
3560; AVX1-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
3561; AVX1-NEXT:    shrq $7, %r11
3562; AVX1-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
3563; AVX1-NEXT:    shlq $55, %r9
3564; AVX1-NEXT:    sarq $63, %r9
3565; AVX1-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
3566; AVX1-NEXT:    shlq $54, %rbx
3567; AVX1-NEXT:    sarq $63, %rbx
3568; AVX1-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
3569; AVX1-NEXT:    shlq $53, %r14
3570; AVX1-NEXT:    sarq $63, %r14
3571; AVX1-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
3572; AVX1-NEXT:    shlq $52, %r15
3573; AVX1-NEXT:    sarq $63, %r15
3574; AVX1-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
3575; AVX1-NEXT:    shlq $51, %r12
3576; AVX1-NEXT:    sarq $63, %r12
3577; AVX1-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
3578; AVX1-NEXT:    shlq $50, %rbp
3579; AVX1-NEXT:    sarq $63, %rbp
3580; AVX1-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
3581; AVX1-NEXT:    shlq $49, %rdx
3582; AVX1-NEXT:    sarq $63, %rdx
3583; AVX1-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
3584; AVX1-NEXT:    shrq $15, %rax
3585; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
3586; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3587; AVX1-NEXT:    popq %rbx
3588; AVX1-NEXT:    popq %r12
3589; AVX1-NEXT:    popq %r13
3590; AVX1-NEXT:    popq %r14
3591; AVX1-NEXT:    popq %r15
3592; AVX1-NEXT:    popq %rbp
3593; AVX1-NEXT:    retq
3594;
3595; AVX2-LABEL: load_sext_32i1_to_32i8:
3596; AVX2:       # BB#0: # %entry
3597; AVX2-NEXT:    pushq %rbp
3598; AVX2-NEXT:    pushq %r15
3599; AVX2-NEXT:    pushq %r14
3600; AVX2-NEXT:    pushq %r13
3601; AVX2-NEXT:    pushq %r12
3602; AVX2-NEXT:    pushq %rbx
3603; AVX2-NEXT:    movslq (%rdi), %rax
3604; AVX2-NEXT:    movq %rax, %rcx
3605; AVX2-NEXT:    shlq $47, %rcx
3606; AVX2-NEXT:    sarq $63, %rcx
3607; AVX2-NEXT:    vmovd %ecx, %xmm0
3608; AVX2-NEXT:    movq %rax, %r8
3609; AVX2-NEXT:    movq %rax, %rdx
3610; AVX2-NEXT:    movq %rax, %rcx
3611; AVX2-NEXT:    movq %rax, %rdi
3612; AVX2-NEXT:    movq %rax, %r13
3613; AVX2-NEXT:    movq %rax, %rsi
3614; AVX2-NEXT:    movq %rax, %r10
3615; AVX2-NEXT:    movq %rax, %r11
3616; AVX2-NEXT:    movq %rax, %r9
3617; AVX2-NEXT:    movq %rax, %rbx
3618; AVX2-NEXT:    movq %rax, %r14
3619; AVX2-NEXT:    movq %rax, %r15
3620; AVX2-NEXT:    movq %rax, %r12
3621; AVX2-NEXT:    movq %rax, %rbp
3622; AVX2-NEXT:    shlq $46, %rbp
3623; AVX2-NEXT:    sarq $63, %rbp
3624; AVX2-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
3625; AVX2-NEXT:    movq %rax, %rbp
3626; AVX2-NEXT:    shlq $45, %r8
3627; AVX2-NEXT:    sarq $63, %r8
3628; AVX2-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
3629; AVX2-NEXT:    movq %rax, %r8
3630; AVX2-NEXT:    shlq $44, %rdx
3631; AVX2-NEXT:    sarq $63, %rdx
3632; AVX2-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
3633; AVX2-NEXT:    movq %rax, %rdx
3634; AVX2-NEXT:    shlq $43, %rcx
3635; AVX2-NEXT:    sarq $63, %rcx
3636; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
3637; AVX2-NEXT:    movq %rax, %rcx
3638; AVX2-NEXT:    shlq $42, %rdi
3639; AVX2-NEXT:    sarq $63, %rdi
3640; AVX2-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
3641; AVX2-NEXT:    movq %rax, %rdi
3642; AVX2-NEXT:    shlq $41, %r13
3643; AVX2-NEXT:    sarq $63, %r13
3644; AVX2-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
3645; AVX2-NEXT:    movq %rax, %r13
3646; AVX2-NEXT:    shlq $40, %rsi
3647; AVX2-NEXT:    sarq $63, %rsi
3648; AVX2-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
3649; AVX2-NEXT:    movq %rax, %rsi
3650; AVX2-NEXT:    shlq $39, %r10
3651; AVX2-NEXT:    sarq $63, %r10
3652; AVX2-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
3653; AVX2-NEXT:    movq %rax, %r10
3654; AVX2-NEXT:    shlq $38, %r11
3655; AVX2-NEXT:    sarq $63, %r11
3656; AVX2-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
3657; AVX2-NEXT:    movsbq %al, %r11
3658; AVX2-NEXT:    shlq $37, %r9
3659; AVX2-NEXT:    sarq $63, %r9
3660; AVX2-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
3661; AVX2-NEXT:    movq %rax, %r9
3662; AVX2-NEXT:    shlq $36, %rbx
3663; AVX2-NEXT:    sarq $63, %rbx
3664; AVX2-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
3665; AVX2-NEXT:    movq %rax, %rbx
3666; AVX2-NEXT:    shlq $35, %r14
3667; AVX2-NEXT:    sarq $63, %r14
3668; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
3669; AVX2-NEXT:    movq %rax, %r14
3670; AVX2-NEXT:    shlq $34, %r15
3671; AVX2-NEXT:    sarq $63, %r15
3672; AVX2-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
3673; AVX2-NEXT:    movq %rax, %r15
3674; AVX2-NEXT:    shlq $33, %r12
3675; AVX2-NEXT:    sarq $63, %r12
3676; AVX2-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
3677; AVX2-NEXT:    movq %rax, %r12
3678; AVX2-NEXT:    shrq $31, %rbp
3679; AVX2-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
3680; AVX2-NEXT:    movq %rax, %rbp
3681; AVX2-NEXT:    shlq $63, %rdx
3682; AVX2-NEXT:    sarq $63, %rdx
3683; AVX2-NEXT:    vmovd %edx, %xmm1
3684; AVX2-NEXT:    movq %rax, %rdx
3685; AVX2-NEXT:    movswq %ax, %rax
3686; AVX2-NEXT:    shlq $62, %r8
3687; AVX2-NEXT:    sarq $63, %r8
3688; AVX2-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
3689; AVX2-NEXT:    shlq $61, %rcx
3690; AVX2-NEXT:    sarq $63, %rcx
3691; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
3692; AVX2-NEXT:    shlq $60, %rdi
3693; AVX2-NEXT:    sarq $63, %rdi
3694; AVX2-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
3695; AVX2-NEXT:    shlq $59, %r13
3696; AVX2-NEXT:    sarq $63, %r13
3697; AVX2-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
3698; AVX2-NEXT:    shlq $58, %rsi
3699; AVX2-NEXT:    sarq $63, %rsi
3700; AVX2-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
3701; AVX2-NEXT:    shlq $57, %r10
3702; AVX2-NEXT:    sarq $63, %r10
3703; AVX2-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
3704; AVX2-NEXT:    shrq $7, %r11
3705; AVX2-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
3706; AVX2-NEXT:    shlq $55, %r9
3707; AVX2-NEXT:    sarq $63, %r9
3708; AVX2-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
3709; AVX2-NEXT:    shlq $54, %rbx
3710; AVX2-NEXT:    sarq $63, %rbx
3711; AVX2-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
3712; AVX2-NEXT:    shlq $53, %r14
3713; AVX2-NEXT:    sarq $63, %r14
3714; AVX2-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
3715; AVX2-NEXT:    shlq $52, %r15
3716; AVX2-NEXT:    sarq $63, %r15
3717; AVX2-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
3718; AVX2-NEXT:    shlq $51, %r12
3719; AVX2-NEXT:    sarq $63, %r12
3720; AVX2-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
3721; AVX2-NEXT:    shlq $50, %rbp
3722; AVX2-NEXT:    sarq $63, %rbp
3723; AVX2-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
3724; AVX2-NEXT:    shlq $49, %rdx
3725; AVX2-NEXT:    sarq $63, %rdx
3726; AVX2-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
3727; AVX2-NEXT:    shrq $15, %rax
3728; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
3729; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3730; AVX2-NEXT:    popq %rbx
3731; AVX2-NEXT:    popq %r12
3732; AVX2-NEXT:    popq %r13
3733; AVX2-NEXT:    popq %r14
3734; AVX2-NEXT:    popq %r15
3735; AVX2-NEXT:    popq %rbp
3736; AVX2-NEXT:    retq
3737;
3738; AVX512-LABEL: load_sext_32i1_to_32i8:
3739; AVX512:       # BB#0: # %entry
3740; AVX512-NEXT:    kmovw (%rdi), %k1
3741; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
3742; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1} {z}
3743; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
3744; AVX512-NEXT:    kmovw 2(%rdi), %k1
3745; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
3746; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3747; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3748; AVX512-NEXT:    retq
3749;
3750; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
3751; X32-SSE41:       # BB#0: # %entry
3752; X32-SSE41-NEXT:    pushl %esi
3753; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3754; X32-SSE41-NEXT:    movswl (%eax), %ecx
3755; X32-SSE41-NEXT:    movl %ecx, %edx
3756; X32-SSE41-NEXT:    shll $30, %edx
3757; X32-SSE41-NEXT:    sarl $31, %edx
3758; X32-SSE41-NEXT:    movl %ecx, %esi
3759; X32-SSE41-NEXT:    shll $31, %esi
3760; X32-SSE41-NEXT:    sarl $31, %esi
3761; X32-SSE41-NEXT:    movd %esi, %xmm0
3762; X32-SSE41-NEXT:    pinsrb $1, %edx, %xmm0
3763; X32-SSE41-NEXT:    movl %ecx, %edx
3764; X32-SSE41-NEXT:    shll $29, %edx
3765; X32-SSE41-NEXT:    sarl $31, %edx
3766; X32-SSE41-NEXT:    pinsrb $2, %edx, %xmm0
3767; X32-SSE41-NEXT:    movl %ecx, %edx
3768; X32-SSE41-NEXT:    shll $28, %edx
3769; X32-SSE41-NEXT:    sarl $31, %edx
3770; X32-SSE41-NEXT:    pinsrb $3, %edx, %xmm0
3771; X32-SSE41-NEXT:    movl %ecx, %edx
3772; X32-SSE41-NEXT:    shll $27, %edx
3773; X32-SSE41-NEXT:    sarl $31, %edx
3774; X32-SSE41-NEXT:    pinsrb $4, %edx, %xmm0
3775; X32-SSE41-NEXT:    movl %ecx, %edx
3776; X32-SSE41-NEXT:    shll $26, %edx
3777; X32-SSE41-NEXT:    sarl $31, %edx
3778; X32-SSE41-NEXT:    pinsrb $5, %edx, %xmm0
3779; X32-SSE41-NEXT:    movl %ecx, %edx
3780; X32-SSE41-NEXT:    shll $25, %edx
3781; X32-SSE41-NEXT:    sarl $31, %edx
3782; X32-SSE41-NEXT:    pinsrb $6, %edx, %xmm0
3783; X32-SSE41-NEXT:    movsbl %cl, %edx
3784; X32-SSE41-NEXT:    shrl $7, %edx
3785; X32-SSE41-NEXT:    pinsrb $7, %edx, %xmm0
3786; X32-SSE41-NEXT:    movl %ecx, %edx
3787; X32-SSE41-NEXT:    shll $23, %edx
3788; X32-SSE41-NEXT:    sarl $31, %edx
3789; X32-SSE41-NEXT:    pinsrb $8, %edx, %xmm0
3790; X32-SSE41-NEXT:    movl %ecx, %edx
3791; X32-SSE41-NEXT:    shll $22, %edx
3792; X32-SSE41-NEXT:    sarl $31, %edx
3793; X32-SSE41-NEXT:    pinsrb $9, %edx, %xmm0
3794; X32-SSE41-NEXT:    movl %ecx, %edx
3795; X32-SSE41-NEXT:    shll $21, %edx
3796; X32-SSE41-NEXT:    sarl $31, %edx
3797; X32-SSE41-NEXT:    pinsrb $10, %edx, %xmm0
3798; X32-SSE41-NEXT:    movl %ecx, %edx
3799; X32-SSE41-NEXT:    shll $20, %edx
3800; X32-SSE41-NEXT:    sarl $31, %edx
3801; X32-SSE41-NEXT:    pinsrb $11, %edx, %xmm0
3802; X32-SSE41-NEXT:    movl %ecx, %edx
3803; X32-SSE41-NEXT:    shll $19, %edx
3804; X32-SSE41-NEXT:    sarl $31, %edx
3805; X32-SSE41-NEXT:    pinsrb $12, %edx, %xmm0
3806; X32-SSE41-NEXT:    movl %ecx, %edx
3807; X32-SSE41-NEXT:    shll $18, %edx
3808; X32-SSE41-NEXT:    sarl $31, %edx
3809; X32-SSE41-NEXT:    pinsrb $13, %edx, %xmm0
3810; X32-SSE41-NEXT:    movl %ecx, %edx
3811; X32-SSE41-NEXT:    shll $17, %edx
3812; X32-SSE41-NEXT:    sarl $31, %edx
3813; X32-SSE41-NEXT:    pinsrb $14, %edx, %xmm0
3814; X32-SSE41-NEXT:    shrl $15, %ecx
3815; X32-SSE41-NEXT:    pinsrb $15, %ecx, %xmm0
3816; X32-SSE41-NEXT:    movswl 2(%eax), %eax
3817; X32-SSE41-NEXT:    movl %eax, %ecx
3818; X32-SSE41-NEXT:    shll $30, %ecx
3819; X32-SSE41-NEXT:    sarl $31, %ecx
3820; X32-SSE41-NEXT:    movl %eax, %edx
3821; X32-SSE41-NEXT:    shll $31, %edx
3822; X32-SSE41-NEXT:    sarl $31, %edx
3823; X32-SSE41-NEXT:    movd %edx, %xmm1
3824; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3825; X32-SSE41-NEXT:    movl %eax, %ecx
3826; X32-SSE41-NEXT:    shll $29, %ecx
3827; X32-SSE41-NEXT:    sarl $31, %ecx
3828; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3829; X32-SSE41-NEXT:    movl %eax, %ecx
3830; X32-SSE41-NEXT:    shll $28, %ecx
3831; X32-SSE41-NEXT:    sarl $31, %ecx
3832; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3833; X32-SSE41-NEXT:    movl %eax, %ecx
3834; X32-SSE41-NEXT:    shll $27, %ecx
3835; X32-SSE41-NEXT:    sarl $31, %ecx
3836; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3837; X32-SSE41-NEXT:    movl %eax, %ecx
3838; X32-SSE41-NEXT:    shll $26, %ecx
3839; X32-SSE41-NEXT:    sarl $31, %ecx
3840; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3841; X32-SSE41-NEXT:    movl %eax, %ecx
3842; X32-SSE41-NEXT:    shll $25, %ecx
3843; X32-SSE41-NEXT:    sarl $31, %ecx
3844; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3845; X32-SSE41-NEXT:    movsbl %al, %ecx
3846; X32-SSE41-NEXT:    shrl $7, %ecx
3847; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3848; X32-SSE41-NEXT:    movl %eax, %ecx
3849; X32-SSE41-NEXT:    shll $23, %ecx
3850; X32-SSE41-NEXT:    sarl $31, %ecx
3851; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3852; X32-SSE41-NEXT:    movl %eax, %ecx
3853; X32-SSE41-NEXT:    shll $22, %ecx
3854; X32-SSE41-NEXT:    sarl $31, %ecx
3855; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3856; X32-SSE41-NEXT:    movl %eax, %ecx
3857; X32-SSE41-NEXT:    shll $21, %ecx
3858; X32-SSE41-NEXT:    sarl $31, %ecx
3859; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3860; X32-SSE41-NEXT:    movl %eax, %ecx
3861; X32-SSE41-NEXT:    shll $20, %ecx
3862; X32-SSE41-NEXT:    sarl $31, %ecx
3863; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3864; X32-SSE41-NEXT:    movl %eax, %ecx
3865; X32-SSE41-NEXT:    shll $19, %ecx
3866; X32-SSE41-NEXT:    sarl $31, %ecx
3867; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3868; X32-SSE41-NEXT:    movl %eax, %ecx
3869; X32-SSE41-NEXT:    shll $18, %ecx
3870; X32-SSE41-NEXT:    sarl $31, %ecx
3871; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3872; X32-SSE41-NEXT:    movl %eax, %ecx
3873; X32-SSE41-NEXT:    shll $17, %ecx
3874; X32-SSE41-NEXT:    sarl $31, %ecx
3875; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3876; X32-SSE41-NEXT:    shrl $15, %eax
3877; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3878; X32-SSE41-NEXT:    popl %esi
3879; X32-SSE41-NEXT:    retl
3880entry:
3881 %X = load <32 x i1>, <32 x i1>* %ptr
3882 %Y = sext <32 x i1> %X to <32 x i8>
3883 ret <32 x i8> %Y
3884}
3885
; Sign-extending load test: reads a <16 x i8> vector from %ptr and widens it
; to <16 x i16>. Per the checks below, SSE2/SSSE3 are expected to unpack each
; 8-byte half with punpcklbw and arithmetic-shift right by 8, SSE4.1 and AVX1
; to issue two (v)pmovsxbw loads (AVX1 then joins them with vinsertf128), and
; AVX2/AVX512 to use one 256-bit vpmovsxbw.
3886define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
3887; SSE2-LABEL: load_sext_16i8_to_16i16:
3888; SSE2:       # BB#0: # %entry
3889; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3890; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3891; SSE2-NEXT:    psraw $8, %xmm0
3892; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
3893; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3894; SSE2-NEXT:    psraw $8, %xmm1
3895; SSE2-NEXT:    retq
3896;
3897; SSSE3-LABEL: load_sext_16i8_to_16i16:
3898; SSSE3:       # BB#0: # %entry
3899; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3900; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3901; SSSE3-NEXT:    psraw $8, %xmm0
3902; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
3903; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3904; SSSE3-NEXT:    psraw $8, %xmm1
3905; SSSE3-NEXT:    retq
3906;
3907; SSE41-LABEL: load_sext_16i8_to_16i16:
3908; SSE41:       # BB#0: # %entry
3909; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
3910; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
3911; SSE41-NEXT:    retq
3912;
3913; AVX1-LABEL: load_sext_16i8_to_16i16:
3914; AVX1:       # BB#0: # %entry
3915; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
3916; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm1
3917; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3918; AVX1-NEXT:    retq
3919;
3920; AVX2-LABEL: load_sext_16i8_to_16i16:
3921; AVX2:       # BB#0: # %entry
3922; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
3923; AVX2-NEXT:    retq
3924;
3925; AVX512-LABEL: load_sext_16i8_to_16i16:
3926; AVX512:       # BB#0: # %entry
3927; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
3928; AVX512-NEXT:    retq
3929;
3930; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
3931; X32-SSE41:       # BB#0: # %entry
3932; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3933; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
3934; X32-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
3935; X32-SSE41-NEXT:    retl
3936entry:
3937 %X = load <16 x i8>, <16 x i8>* %ptr
3938 %Y = sext <16 x i8> %X to <16 x i16>
3939 ret <16 x i16> %Y
3940}
3941
; Sign-extending load test: reads a <2 x i16> vector from %ptr and widens it
; to <2 x i64>. Per the checks below, SSE2/SSSE3 are expected to build the
; result from punpcklwd, psrad (by 16 for the values, by 31 for the sign
; dwords) and punpckldq, while SSE4.1 and all AVX runs fold the load into a
; single (v)pmovsxwq.
3942define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
3943; SSE2-LABEL: load_sext_2i16_to_2i64:
3944; SSE2:       # BB#0: # %entry
3945; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3946; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3947; SSE2-NEXT:    movdqa %xmm0, %xmm1
3948; SSE2-NEXT:    psrad $31, %xmm1
3949; SSE2-NEXT:    psrad $16, %xmm0
3950; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3951; SSE2-NEXT:    retq
3952;
3953; SSSE3-LABEL: load_sext_2i16_to_2i64:
3954; SSSE3:       # BB#0: # %entry
3955; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3956; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3957; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3958; SSSE3-NEXT:    psrad $31, %xmm1
3959; SSSE3-NEXT:    psrad $16, %xmm0
3960; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3961; SSSE3-NEXT:    retq
3962;
3963; SSE41-LABEL: load_sext_2i16_to_2i64:
3964; SSE41:       # BB#0: # %entry
3965; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
3966; SSE41-NEXT:    retq
3967;
3968; AVX-LABEL: load_sext_2i16_to_2i64:
3969; AVX:       # BB#0: # %entry
3970; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
3971; AVX-NEXT:    retq
3972;
3973; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
3974; X32-SSE41:       # BB#0: # %entry
3975; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3976; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
3977; X32-SSE41-NEXT:    retl
3978entry:
3979 %X = load <2 x i16>, <2 x i16>* %ptr
3980 %Y = sext <2 x i16> %X to <2 x i64>
3981 ret <2 x i64> %Y
3982}
3983
; Sign-extending load test: reads a <4 x i16> vector from %ptr and widens it
; to <4 x i32>. Per the checks below, SSE2/SSSE3 are expected to use
; punpcklwd followed by psrad by 16, while SSE4.1 and all AVX runs fold the
; load into a single (v)pmovsxwd.
3984define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
3985; SSE2-LABEL: load_sext_4i16_to_4i32:
3986; SSE2:       # BB#0: # %entry
3987; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3988; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3989; SSE2-NEXT:    psrad $16, %xmm0
3990; SSE2-NEXT:    retq
3991;
3992; SSSE3-LABEL: load_sext_4i16_to_4i32:
3993; SSSE3:       # BB#0: # %entry
3994; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3995; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3996; SSSE3-NEXT:    psrad $16, %xmm0
3997; SSSE3-NEXT:    retq
3998;
3999; SSE41-LABEL: load_sext_4i16_to_4i32:
4000; SSE41:       # BB#0: # %entry
4001; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4002; SSE41-NEXT:    retq
4003;
4004; AVX-LABEL: load_sext_4i16_to_4i32:
4005; AVX:       # BB#0: # %entry
4006; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
4007; AVX-NEXT:    retq
4008;
4009; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
4010; X32-SSE41:       # BB#0: # %entry
4011; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4012; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
4013; X32-SSE41-NEXT:    retl
4014entry:
4015 %X = load <4 x i16>, <4 x i16>* %ptr
4016 %Y = sext <4 x i16> %X to <4 x i32>
4017 ret <4 x i32> %Y
4018}
4019
; Sign-extending load test: reads a <4 x i16> vector from %ptr and widens it
; to <4 x i64>. Per the checks below, SSE2/SSSE3 are expected to fall back to
; four scalar movswq loads recombined with punpcklqdq, SSE4.1 to use two
; pmovsxwq loads, AVX1 to go through vpmovsxwd plus two vpmovsxdq joined by
; vinsertf128, and AVX2/AVX512 to use one 256-bit vpmovsxwq.
4020define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
4021; SSE2-LABEL: load_sext_4i16_to_4i64:
4022; SSE2:       # BB#0: # %entry
4023; SSE2-NEXT:    movswq 2(%rdi), %rax
4024; SSE2-NEXT:    movd %rax, %xmm1
4025; SSE2-NEXT:    movswq (%rdi), %rax
4026; SSE2-NEXT:    movd %rax, %xmm0
4027; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4028; SSE2-NEXT:    movswq 6(%rdi), %rax
4029; SSE2-NEXT:    movd %rax, %xmm2
4030; SSE2-NEXT:    movswq 4(%rdi), %rax
4031; SSE2-NEXT:    movd %rax, %xmm1
4032; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4033; SSE2-NEXT:    retq
4034;
4035; SSSE3-LABEL: load_sext_4i16_to_4i64:
4036; SSSE3:       # BB#0: # %entry
4037; SSSE3-NEXT:    movswq 2(%rdi), %rax
4038; SSSE3-NEXT:    movd %rax, %xmm1
4039; SSSE3-NEXT:    movswq (%rdi), %rax
4040; SSSE3-NEXT:    movd %rax, %xmm0
4041; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4042; SSSE3-NEXT:    movswq 6(%rdi), %rax
4043; SSSE3-NEXT:    movd %rax, %xmm2
4044; SSSE3-NEXT:    movswq 4(%rdi), %rax
4045; SSSE3-NEXT:    movd %rax, %xmm1
4046; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4047; SSSE3-NEXT:    retq
4048;
4049; SSE41-LABEL: load_sext_4i16_to_4i64:
4050; SSE41:       # BB#0: # %entry
4051; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
4052; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
4053; SSE41-NEXT:    retq
4054;
4055; AVX1-LABEL: load_sext_4i16_to_4i64:
4056; AVX1:       # BB#0: # %entry
4057; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
4058; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4059; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4060; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4061; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4062; AVX1-NEXT:    retq
4063;
4064; AVX2-LABEL: load_sext_4i16_to_4i64:
4065; AVX2:       # BB#0: # %entry
4066; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
4067; AVX2-NEXT:    retq
4068;
4069; AVX512-LABEL: load_sext_4i16_to_4i64:
4070; AVX512:       # BB#0: # %entry
4071; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
4072; AVX512-NEXT:    retq
4073;
4074; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
4075; X32-SSE41:       # BB#0: # %entry
4076; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4077; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
4078; X32-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
4079; X32-SSE41-NEXT:    retl
4080entry:
4081 %X = load <4 x i16>, <4 x i16>* %ptr
4082 %Y = sext <4 x i16> %X to <4 x i64>
4083 ret <4 x i64> %Y
4084}
4085
; Sign-extending load test: reads an <8 x i16> vector from %ptr and widens it
; to <8 x i32>. Per the checks below, SSE2/SSSE3 are expected to unpack each
; 8-byte half with punpcklwd and arithmetic-shift right by 16, SSE4.1 and
; AVX1 to issue two (v)pmovsxwd loads (AVX1 then joins them with
; vinsertf128), and AVX2/AVX512 to use one 256-bit vpmovsxwd.
4086define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
4087; SSE2-LABEL: load_sext_8i16_to_8i32:
4088; SSE2:       # BB#0: # %entry
4089; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4090; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4091; SSE2-NEXT:    psrad $16, %xmm0
4092; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4093; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
4094; SSE2-NEXT:    psrad $16, %xmm1
4095; SSE2-NEXT:    retq
4096;
4097; SSSE3-LABEL: load_sext_8i16_to_8i32:
4098; SSSE3:       # BB#0: # %entry
4099; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4100; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4101; SSSE3-NEXT:    psrad $16, %xmm0
4102; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4103; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
4104; SSSE3-NEXT:    psrad $16, %xmm1
4105; SSSE3-NEXT:    retq
4106;
4107; SSE41-LABEL: load_sext_8i16_to_8i32:
4108; SSE41:       # BB#0: # %entry
4109; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4110; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
4111; SSE41-NEXT:    retq
4112;
4113; AVX1-LABEL: load_sext_8i16_to_8i32:
4114; AVX1:       # BB#0: # %entry
4115; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
4116; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
4117; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4118; AVX1-NEXT:    retq
4119;
4120; AVX2-LABEL: load_sext_8i16_to_8i32:
4121; AVX2:       # BB#0: # %entry
4122; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
4123; AVX2-NEXT:    retq
4124;
4125; AVX512-LABEL: load_sext_8i16_to_8i32:
4126; AVX512:       # BB#0: # %entry
4127; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
4128; AVX512-NEXT:    retq
4129;
4130; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
4131; X32-SSE41:       # BB#0: # %entry
4132; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4133; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
4134; X32-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
4135; X32-SSE41-NEXT:    retl
4136entry:
4137 %X = load <8 x i16>, <8 x i16>* %ptr
4138 %Y = sext <8 x i16> %X to <8 x i32>
4139 ret <8 x i32> %Y
4140}
4141
; Sign-extending load test: reads a <2 x i32> vector from %ptr and widens it
; to <2 x i64>. Per the checks below, SSE2/SSSE3 are expected to interleave
; the loaded dwords with a psrad-by-31 copy of themselves via punpckldq,
; while SSE4.1 and all AVX runs fold the load into a single (v)pmovsxdq.
4142define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
4143; SSE2-LABEL: load_sext_2i32_to_2i64:
4144; SSE2:       # BB#0: # %entry
4145; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4146; SSE2-NEXT:    movdqa %xmm0, %xmm1
4147; SSE2-NEXT:    psrad $31, %xmm1
4148; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4149; SSE2-NEXT:    retq
4150;
4151; SSSE3-LABEL: load_sext_2i32_to_2i64:
4152; SSSE3:       # BB#0: # %entry
4153; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4154; SSSE3-NEXT:    movdqa %xmm0, %xmm1
4155; SSSE3-NEXT:    psrad $31, %xmm1
4156; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4157; SSSE3-NEXT:    retq
4158;
4159; SSE41-LABEL: load_sext_2i32_to_2i64:
4160; SSE41:       # BB#0: # %entry
4161; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
4162; SSE41-NEXT:    retq
4163;
4164; AVX-LABEL: load_sext_2i32_to_2i64:
4165; AVX:       # BB#0: # %entry
4166; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
4167; AVX-NEXT:    retq
4168;
4169; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
4170; X32-SSE41:       # BB#0: # %entry
4171; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4172; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
4173; X32-SSE41-NEXT:    retl
4174entry:
4175 %X = load <2 x i32>, <2 x i32>* %ptr
4176 %Y = sext <2 x i32> %X to <2 x i64>
4177 ret <2 x i64> %Y
4178}
4179
; Sign-extending load test: reads a <4 x i32> vector from %ptr and widens it
; to <4 x i64>. Per the checks below, SSE2/SSSE3 are expected to load the full
; vector with movdqa and widen each half by interleaving it with a
; psrad-by-31 copy (punpckldq), SSE4.1 and AVX1 to issue two (v)pmovsxdq
; loads (AVX1 then joins them with vinsertf128), and AVX2/AVX512 to use one
; 256-bit vpmovsxdq.
4180define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
4181; SSE2-LABEL: load_sext_4i32_to_4i64:
4182; SSE2:       # BB#0: # %entry
4183; SSE2-NEXT:    movdqa (%rdi), %xmm0
4184; SSE2-NEXT:    movdqa %xmm0, %xmm2
4185; SSE2-NEXT:    psrad $31, %xmm2
4186; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4187; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4188; SSE2-NEXT:    movdqa %xmm1, %xmm2
4189; SSE2-NEXT:    psrad $31, %xmm2
4190; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4191; SSE2-NEXT:    retq
4192;
4193; SSSE3-LABEL: load_sext_4i32_to_4i64:
4194; SSSE3:       # BB#0: # %entry
4195; SSSE3-NEXT:    movdqa (%rdi), %xmm0
4196; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4197; SSSE3-NEXT:    psrad $31, %xmm2
4198; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4199; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4200; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4201; SSSE3-NEXT:    psrad $31, %xmm2
4202; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4203; SSSE3-NEXT:    retq
4204;
4205; SSE41-LABEL: load_sext_4i32_to_4i64:
4206; SSE41:       # BB#0: # %entry
4207; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
4208; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
4209; SSE41-NEXT:    retq
4210;
4211; AVX1-LABEL: load_sext_4i32_to_4i64:
4212; AVX1:       # BB#0: # %entry
4213; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm0
4214; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm1
4215; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4216; AVX1-NEXT:    retq
4217;
4218; AVX2-LABEL: load_sext_4i32_to_4i64:
4219; AVX2:       # BB#0: # %entry
4220; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
4221; AVX2-NEXT:    retq
4222;
4223; AVX512-LABEL: load_sext_4i32_to_4i64:
4224; AVX512:       # BB#0: # %entry
4225; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
4226; AVX512-NEXT:    retq
4227;
4228; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
4229; X32-SSE41:       # BB#0: # %entry
4230; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4231; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
4232; X32-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
4233; X32-SSE41-NEXT:    retl
4234entry:
4235 %X = load <4 x i32>, <4 x i32>* %ptr
4236 %Y = sext <4 x i32> %X to <4 x i64>
4237 ret <4 x i64> %Y
4238}
4239
; Takes the two lowest bytes of %A (shufflevector indices 0 and 1),
; sign-extends them to <2 x i16>, and returns that pair bitcast to a scalar
; i32. Per the checks below, the vector is widened with punpcklbw + psraw by 8
; on SSE2/SSSE3 or with (v)pmovsxbw elsewhere, and the low 32 bits are then
; moved into eax. The 32-bit SSE4.1 run additionally expects a pushl/popl
; pair around the body together with a .cfi_def_cfa_offset directive.
4240define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
4241; SSE2-LABEL: sext_2i8_to_i32:
4242; SSE2:       # BB#0: # %entry
4243; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4244; SSE2-NEXT:    psraw $8, %xmm0
4245; SSE2-NEXT:    movd %xmm0, %eax
4246; SSE2-NEXT:    retq
4247;
4248; SSSE3-LABEL: sext_2i8_to_i32:
4249; SSSE3:       # BB#0: # %entry
4250; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4251; SSSE3-NEXT:    psraw $8, %xmm0
4252; SSSE3-NEXT:    movd %xmm0, %eax
4253; SSSE3-NEXT:    retq
4254;
4255; SSE41-LABEL: sext_2i8_to_i32:
4256; SSE41:       # BB#0: # %entry
4257; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
4258; SSE41-NEXT:    movd %xmm0, %eax
4259; SSE41-NEXT:    retq
4260;
4261; AVX-LABEL: sext_2i8_to_i32:
4262; AVX:       # BB#0: # %entry
4263; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
4264; AVX-NEXT:    vmovd %xmm0, %eax
4265; AVX-NEXT:    retq
4266;
4267; X32-SSE41-LABEL: sext_2i8_to_i32:
4268; X32-SSE41:       # BB#0: # %entry
4269; X32-SSE41-NEXT:    pushl %eax
4270; X32-SSE41-NEXT:  .Ltmp0:
4271; X32-SSE41-NEXT:    .cfi_def_cfa_offset 8
4272; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
4273; X32-SSE41-NEXT:    movd %xmm0, %eax
4274; X32-SSE41-NEXT:    popl %ecx
4275; X32-SSE41-NEXT:    retl
4276entry:
4277  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
4278  %Ex = sext <2 x i8> %Shuf to <2 x i16>
4279  %Bc = bitcast <2 x i16> %Ex to i32
4280  ret i32 %Bc
4281}
4282
; Sign-extends a <4 x i1> argument (%mask) to <4 x i64>. Per the checks
; below, every run first shifts each 32-bit lane left by 31 and
; arithmetic-shifts it back by 31, then widens the dwords to qwords:
; psrad/pshufd/punpckldq on SSE2/SSSE3, two (v)pmovsxdq on SSE4.1 and AVX1,
; and one 256-bit vpmovsxdq on AVX2/AVX512.
4283define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
4284; SSE2-LABEL: sext_4i1_to_4i64:
4285; SSE2:       # BB#0:
4286; SSE2-NEXT:    pslld $31, %xmm0
4287; SSE2-NEXT:    psrad $31, %xmm0
4288; SSE2-NEXT:    movdqa %xmm0, %xmm2
4289; SSE2-NEXT:    psrad $31, %xmm2
4290; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4291; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4292; SSE2-NEXT:    movdqa %xmm1, %xmm2
4293; SSE2-NEXT:    psrad $31, %xmm2
4294; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4295; SSE2-NEXT:    retq
4296;
4297; SSSE3-LABEL: sext_4i1_to_4i64:
4298; SSSE3:       # BB#0:
4299; SSSE3-NEXT:    pslld $31, %xmm0
4300; SSSE3-NEXT:    psrad $31, %xmm0
4301; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4302; SSSE3-NEXT:    psrad $31, %xmm2
4303; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4304; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4305; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4306; SSSE3-NEXT:    psrad $31, %xmm2
4307; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4308; SSSE3-NEXT:    retq
4309;
4310; SSE41-LABEL: sext_4i1_to_4i64:
4311; SSE41:       # BB#0:
4312; SSE41-NEXT:    pslld $31, %xmm0
4313; SSE41-NEXT:    psrad $31, %xmm0
4314; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4315; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4316; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4317; SSE41-NEXT:    movdqa %xmm2, %xmm0
4318; SSE41-NEXT:    retq
4319;
4320; AVX1-LABEL: sext_4i1_to_4i64:
4321; AVX1:       # BB#0:
4322; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
4323; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
4324; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4325; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4326; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4327; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4328; AVX1-NEXT:    retq
4329;
4330; AVX2-LABEL: sext_4i1_to_4i64:
4331; AVX2:       # BB#0:
4332; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
4333; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
4334; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
4335; AVX2-NEXT:    retq
4336;
4337; AVX512-LABEL: sext_4i1_to_4i64:
4338; AVX512:       # BB#0:
4339; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
4340; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
4341; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
4342; AVX512-NEXT:    retq
4343;
4344; X32-SSE41-LABEL: sext_4i1_to_4i64:
4345; X32-SSE41:       # BB#0:
4346; X32-SSE41-NEXT:    pslld $31, %xmm0
4347; X32-SSE41-NEXT:    psrad $31, %xmm0
4348; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4349; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4350; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4351; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
4352; X32-SSE41-NEXT:    retl
4353  %extmask = sext <4 x i1> %mask to <4 x i64>
4354  ret <4 x i64> %extmask
4355}
4356
; Sign-extends a <4 x i8> argument (%mask) to <4 x i64>. Per the checks
; below, every run first shifts each 32-bit lane left by 24 and
; arithmetic-shifts it back by 24, then widens the dwords to qwords:
; psrad/pshufd/punpckldq on SSE2/SSSE3, two (v)pmovsxdq on SSE4.1 and AVX1,
; and one 256-bit vpmovsxdq on AVX2/AVX512.
4357define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
4358; SSE2-LABEL: sext_4i8_to_4i64:
4359; SSE2:       # BB#0:
4360; SSE2-NEXT:    pslld $24, %xmm0
4361; SSE2-NEXT:    psrad $24, %xmm0
4362; SSE2-NEXT:    movdqa %xmm0, %xmm2
4363; SSE2-NEXT:    psrad $31, %xmm2
4364; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4365; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4366; SSE2-NEXT:    movdqa %xmm1, %xmm2
4367; SSE2-NEXT:    psrad $31, %xmm2
4368; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4369; SSE2-NEXT:    retq
4370;
4371; SSSE3-LABEL: sext_4i8_to_4i64:
4372; SSSE3:       # BB#0:
4373; SSSE3-NEXT:    pslld $24, %xmm0
4374; SSSE3-NEXT:    psrad $24, %xmm0
4375; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4376; SSSE3-NEXT:    psrad $31, %xmm2
4377; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4378; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4379; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4380; SSSE3-NEXT:    psrad $31, %xmm2
4381; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4382; SSSE3-NEXT:    retq
4383;
4384; SSE41-LABEL: sext_4i8_to_4i64:
4385; SSE41:       # BB#0:
4386; SSE41-NEXT:    pslld $24, %xmm0
4387; SSE41-NEXT:    psrad $24, %xmm0
4388; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4389; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4390; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4391; SSE41-NEXT:    movdqa %xmm2, %xmm0
4392; SSE41-NEXT:    retq
4393;
4394; AVX1-LABEL: sext_4i8_to_4i64:
4395; AVX1:       # BB#0:
4396; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
4397; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
4398; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4399; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4400; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4401; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4402; AVX1-NEXT:    retq
4403;
4404; AVX2-LABEL: sext_4i8_to_4i64:
4405; AVX2:       # BB#0:
4406; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
4407; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
4408; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
4409; AVX2-NEXT:    retq
4410;
4411; AVX512-LABEL: sext_4i8_to_4i64:
4412; AVX512:       # BB#0:
4413; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
4414; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
4415; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
4416; AVX512-NEXT:    retq
4417;
4418; X32-SSE41-LABEL: sext_4i8_to_4i64:
4419; X32-SSE41:       # BB#0:
4420; X32-SSE41-NEXT:    pslld $24, %xmm0
4421; X32-SSE41-NEXT:    psrad $24, %xmm0
4422; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4423; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4424; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4425; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
4426; X32-SSE41-NEXT:    retl
4427  %extmask = sext <4 x i8> %mask to <4 x i64>
4428  ret <4 x i64> %extmask
4429}
4430