; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
;
; 32-bit tests to make sure we're not doing anything stupid.
; RUN: llc < %s -mtriple=i686-unknown-unknown
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2

;
; Signed Integer to Double
;

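; None of the targets tested here (SSE2/AVX/AVX2) have a packed i64 -> f64
; conversion instruction, so the i64 cases below are expected to lower to a
; scalar cvtsi2sdq per element, with shuffles to split and recombine lanes.
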
define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

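; Where the AVX conversion below widens to a ymm register but only the low
; xmm half is returned, vzeroupper is expected before ret to avoid AVX/SSE
; transition penalties.
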
define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

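; SSE2 has no pmovsx* instructions, so the i16/i8 cases below sign-extend
; manually: unpack the narrow elements into the high end of each 32-bit lane
; (punpcklwd/punpcklbw), then arithmetic-shift right with psrad. AVX targets
; can use vpmovsxwd/vpmovsxbd directly.
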
define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    movapd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Unsigned Integer to Double
;

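; With no unsigned conversion instruction available, u64 -> f64 uses the
; standard magic-constant trick: the 32-bit halves of each element are
; interleaved with the exponent words 1127219200 (0x43300000, high word of
; 2^52 = 4.503600e+15) and 1160773632 (0x45300000, high word of 2^84 =
; 1.934281e+25), the biases are removed with subpd, and the two partial
; doubles are summed (addpd/vhaddpd).
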
define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

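; In the AVX paths below, u32 -> f64 is handled by splitting each element
; into 16-bit halves: the high half is converted, scaled by 2^16 with
; vmulpd, and added to the converted low half.
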
define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

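; For the unsigned i16/i8 cases, zero extension (punpck* against a zeroed
; register, or vpmovzx*) leaves values that fit in 31 bits, so the signed
; cvtdq2pd conversion can be used safely afterwards.
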
define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm4, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm5, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Signed Integer to Float
;

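; As with f64, there is no packed i64 -> f32 conversion here, so the i64
; cases below convert element by element with scalar cvtsi2ssq and rebuild
; the vector with unpcklps/insertps.
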
define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = sitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: sitofp_8i32_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_8i32_to_8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_8i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT:    vpslld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = sitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Unsigned Integer to Float
;

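; u64 -> f32 cannot reuse the signed convert directly for values with the
; top bit set, so the lowering branches on the sign: non-negative values use
; cvtsi2ssq as-is, while large values are halved (shrq, with the dropped low
; bit OR'd back in to preserve correct rounding), converted, and doubled
; again with addss.
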
define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_1
; SSE-NEXT:  # BB#2:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB38_3
; SSE-NEXT:  .LBB38_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB38_3:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_4
; SSE-NEXT:  # BB#5:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
; SSE-NEXT:  .LBB38_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_1
; AVX-NEXT:  # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB38_3
; AVX-NEXT:  .LBB38_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:  .LBB38_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_4
; AVX-NEXT:  # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB38_6
; AVX-NEXT:  .LBB38_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:  .LBB38_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_8
; AVX-NEXT:  # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:  .LBB38_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    js .LBB39_2
; SSE-NEXT:  # BB#1:
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:  .LBB39_2:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_3
; SSE-NEXT:  # BB#4:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB39_5
; SSE-NEXT:  .LBB39_3:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB39_5:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_6
; SSE-NEXT:  # BB#7:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB39_8
; SSE-NEXT:  .LBB39_6:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:  .LBB39_8:
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_1
; AVX-NEXT:  # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB39_3
; AVX-NEXT:  .LBB39_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:  .LBB39_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_4
; AVX-NEXT:  # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB39_6
; AVX-NEXT:  .LBB39_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:  .LBB39_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_8
; AVX-NEXT:  # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:  .LBB39_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = uitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

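; u32 -> f32 below is branchless: each element is split into 16-bit halves,
; each half is OR'd/blended into the mantissa of a magic floating-point
; constant, the combined bias is subtracted with addps, and the two halves
; are summed.
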
define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    por {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    por {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1445; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1446; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1447; AVX2-NEXT:    vzeroupper
1448; AVX2-NEXT:    retq
1449  %cvt = uitofp <16 x i8> %a to <16 x float>
1450  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1451  ret <4 x float> %shuf
1452}
1453
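; Each i64 lane below is scalarized: a non-negative value is converted
; directly with cvtsi2ssq, while a value with the sign bit set (>= 2^63
; unsigned) is halved with its low bit folded back in (shrq; orq) so rounding
; survives, converted, then doubled (addss of the result with itself). A
; branchless IR sketch of one lane (illustrative only; the name is not part
; of the original test):
define float @u64_to_f32_sketch(i64 %x) {
  %big = icmp slt i64 %x, 0           ; unsigned value >= 2^63?
  %shr = lshr i64 %x, 1
  %lob = and i64 %x, 1                ; keep the low bit for rounding
  %half = or i64 %shr, %lob
  %in = select i1 %big, i64 %half, i64 %x
  %f = sitofp i64 %in to float
  %f2 = fadd float %f, %f             ; undo the halving
  %r = select i1 %big, float %f2, float %f
  ret float %r
}
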
define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_1
; SSE-NEXT:  # BB#2:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    jmp .LBB45_3
; SSE-NEXT:  .LBB45_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
; SSE-NEXT:    addss %xmm3, %xmm3
; SSE-NEXT:  .LBB45_3:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_4
; SSE-NEXT:  # BB#5:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    jmp .LBB45_6
; SSE-NEXT:  .LBB45_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
; SSE-NEXT:    addss %xmm2, %xmm2
; SSE-NEXT:  .LBB45_6:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_7
; SSE-NEXT:  # BB#8:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB45_9
; SSE-NEXT:  .LBB45_7:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:  .LBB45_9:
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_10
; SSE-NEXT:  # BB#11:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB45_12
; SSE-NEXT:  .LBB45_10:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB45_12:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB45_1
; AVX1-NEXT:  # BB#2:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    jmp .LBB45_3
; AVX1-NEXT:  .LBB45_1:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:  .LBB45_3:
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB45_4
; AVX1-NEXT:  # BB#5:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB45_6
; AVX1-NEXT:  .LBB45_4:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT:  .LBB45_6:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB45_7
; AVX1-NEXT:  # BB#8:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB45_9
; AVX1-NEXT:  .LBB45_7:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT:  .LBB45_9:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB45_10
; AVX1-NEXT:  # BB#11:
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
; AVX1-NEXT:  .LBB45_10:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i64_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB45_1
; AVX2-NEXT:  # BB#2:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT:    jmp .LBB45_3
; AVX2-NEXT:  .LBB45_1:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT:  .LBB45_3:
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB45_4
; AVX2-NEXT:  # BB#5:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    jmp .LBB45_6
; AVX2-NEXT:  .LBB45_4:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT:  .LBB45_6:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB45_7
; AVX2-NEXT:  # BB#8:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    jmp .LBB45_9
; AVX2-NEXT:  .LBB45_7:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT:  .LBB45_9:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB45_10
; AVX2-NEXT:  # BB#11:
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
; AVX2-NEXT:  .LBB45_10:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

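; In @uitofp_8i32_to_8f32 the AVX1 path uses the arithmetic form of the split
; from @u32_to_f32_split_sketch: mask the low 16 bits and convert, shift the
; high 16 bits down and convert, then recombine as hi*65536.0 + lo via the
; vmulps/vaddps pair. The SSE2 and AVX2 paths keep the magic-bias form, with
; the bias constants visible in the SSE checks below.
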
define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: uitofp_8i32_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pand %xmm2, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT:    por %xmm4, %xmm3
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
; SSE-NEXT:    por %xmm5, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
; SSE-NEXT:    addps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm3, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm2
; SSE-NEXT:    por %xmm4, %xmm2
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    addps %xmm6, %xmm1
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i32_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i32_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_8i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = uitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Aggregates
;

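; %Arguments is a packed struct, so the <8 x i16> field sits at byte offset 8
; (right after the <8 x i8>) and the output pointer at offset 24; that is
; where the movdqu 8(%rdi) and movq 24(%rdi) in the checks below come from.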
%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movq 24(%rdi), %rax
; SSE-NEXT:    movdqu 8(%rdi), %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 16(%rax)
; SSE-NEXT:    movaps %xmm1, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    movq 24(%rdi), %rax
; AVX1-NEXT:    vmovdqu 8(%rdi), %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    movq 24(%rdi), %rax
; AVX2-NEXT:    vpmovsxwd 8(%rdi), %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %1 = load %Arguments, %Arguments* %a0, align 1
  %2 = extractvalue %Arguments %1, 1
  %3 = extractvalue %Arguments %1, 2
  %4 = sitofp <8 x i16> %2 to <8 x float>
  store <8 x float> %4, <8 x float>* %3, align 32
  ret void
}