; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
;
; 32-bit tests to make sure we're not doing anything stupid.
; RUN: llc < %s -mtriple=i686-unknown-unknown
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
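; Note that the i686 RUN lines above are not piped into FileCheck, so they
; only verify that llc can select code for these cases without crashing.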

;
; Signed Integer to Double
;
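; Note: x86 has no packed i64-to-f64 conversion before AVX-512DQ, so the i64
; cases below are expected to extract each element and use the scalar
; cvtsi2sdq instruction, while i32/i16/i8 sources sign-extend to i32 and use
; the packed cvtdq2pd conversion.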

define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

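; The '# kill' markers in the checks below appear to be scrubbed liveness
; annotations for instructions whose xmm result is the low half of a ymm
; register; they carry no machine code of their own.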
define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    # kill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    movapd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Unsigned Integer to Double
;
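; Note: the i64 cases below use the classic magic-constant trick. The 32-bit
; halves of each element are interleaved with the exponent patterns
; 0x43300000 (1127219200) and 0x45300000 (1160773632) to form the doubles
; (2^52 + lo) and (2^84 + hi * 2^32); subtracting [2^52, 2^84] (the
; 4.503600e+15 / 1.934281e+25 constant) and summing the two lanes then yields
; hi * 2^32 + lo with a single rounding. The AVX i32 cases instead convert
; the low and high 16-bit halves separately and recombine them as
; hi16 * 2^16 + lo16.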

define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm4, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm5, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Signed Integer to Float
;
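; As with the double cases, there is no packed i64-to-f32 conversion here, so
; the i64 tests go element by element through the scalar cvtsi2ssq
; instruction and rebuild the vector with unpcklps/vinsertps; i32 sources map
; directly onto the packed cvtdq2ps conversion.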

define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = sitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: sitofp_8i32_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_8i32_to_8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_8i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = sitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Unsigned Integer to Float
;
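; Unsigned 64-bit values do not fit the signed scalar converters directly, so
; the checks below expect a branch per element: non-negative values use
; cvtsi2ssq as-is, while values with the sign bit set are halved with the low
; bit folded back in (round-to-odd), converted, and then doubled. The u32
; cases split each element into 16-bit halves, merge them with float exponent
; patterns, and add the pieces, avoiding any 64-bit arithmetic.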

define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_1
; SSE-NEXT:  # BB#2:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB38_3
; SSE-NEXT:  .LBB38_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB38_3:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_4
; SSE-NEXT:  # BB#5:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
; SSE-NEXT:  .LBB38_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_1
; AVX-NEXT:  # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB38_3
; AVX-NEXT:  .LBB38_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:  .LBB38_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_4
; AVX-NEXT:  # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB38_6
; AVX-NEXT:  .LBB38_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:  .LBB38_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_8
; AVX-NEXT:  # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:  .LBB38_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    js .LBB39_2
; SSE-NEXT:  # BB#1:
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:  .LBB39_2:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_3
; SSE-NEXT:  # BB#4:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB39_5
; SSE-NEXT:  .LBB39_3:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB39_5:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_6
; SSE-NEXT:  # BB#7:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB39_8
; SSE-NEXT:  .LBB39_6:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:  .LBB39_8:
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_1
; AVX-NEXT:  # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB39_3
; AVX-NEXT:  .LBB39_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:  .LBB39_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_4
; AVX-NEXT:  # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB39_6
; AVX-NEXT:  .LBB39_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:  .LBB39_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_8
; AVX-NEXT:  # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:  .LBB39_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = uitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    por {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    por {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1460; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1461; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1462; AVX2-NEXT:    # kill
1463; AVX2-NEXT:    vzeroupper
1464; AVX2-NEXT:    retq
1465  %cvt = uitofp <16 x i8> %a to <16 x float>
1466  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1467  ret <4 x float> %shuf
1468}
1469
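; There is no packed u64 -> f32 instruction here, so each lane goes through
; a GPR and a scalar signed convert. When the sign bit is set, the value is
; halved first (folding the shifted-out low bit back in so rounding is
; preserved) and the converted result doubled. Scalar sketch (editor's note):
;
;   if ((int64_t)x >= 0)
;     f = (float)(int64_t)x;
;   else
;     f = 2.0f * (float)(int64_t)((x >> 1) | (x & 1));
;
; which is the testq/js/shrq/orq/addss pattern repeated per lane below.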
1470define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
1471; SSE-LABEL: uitofp_4i64_to_4f32:
1472; SSE:       # BB#0:
1473; SSE-NEXT:    movd %xmm1, %rax
1474; SSE-NEXT:    movl %eax, %ecx
1475; SSE-NEXT:    andl $1, %ecx
1476; SSE-NEXT:    testq %rax, %rax
1477; SSE-NEXT:    js .LBB45_1
1478; SSE-NEXT:  # BB#2:
1479; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
1480; SSE-NEXT:    jmp .LBB45_3
1481; SSE-NEXT:  .LBB45_1:
1482; SSE-NEXT:    shrq %rax
1483; SSE-NEXT:    orq %rax, %rcx
1484; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
1485; SSE-NEXT:    addss %xmm3, %xmm3
1486; SSE-NEXT:  .LBB45_3:
1487; SSE-NEXT:    movd %xmm0, %rax
1488; SSE-NEXT:    movl %eax, %ecx
1489; SSE-NEXT:    andl $1, %ecx
1490; SSE-NEXT:    testq %rax, %rax
1491; SSE-NEXT:    js .LBB45_4
1492; SSE-NEXT:  # BB#5:
1493; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
1494; SSE-NEXT:    jmp .LBB45_6
1495; SSE-NEXT:  .LBB45_4:
1496; SSE-NEXT:    shrq %rax
1497; SSE-NEXT:    orq %rax, %rcx
1498; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
1499; SSE-NEXT:    addss %xmm2, %xmm2
1500; SSE-NEXT:  .LBB45_6:
1501; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1502; SSE-NEXT:    movd %xmm1, %rax
1503; SSE-NEXT:    movl %eax, %ecx
1504; SSE-NEXT:    andl $1, %ecx
1505; SSE-NEXT:    testq %rax, %rax
1506; SSE-NEXT:    js .LBB45_7
1507; SSE-NEXT:  # BB#8:
1508; SSE-NEXT:    xorps %xmm1, %xmm1
1509; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
1510; SSE-NEXT:    jmp .LBB45_9
1511; SSE-NEXT:  .LBB45_7:
1512; SSE-NEXT:    shrq %rax
1513; SSE-NEXT:    orq %rax, %rcx
1514; SSE-NEXT:    xorps %xmm1, %xmm1
1515; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
1516; SSE-NEXT:    addss %xmm1, %xmm1
1517; SSE-NEXT:  .LBB45_9:
1518; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1519; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1520; SSE-NEXT:    movd %xmm0, %rax
1521; SSE-NEXT:    movl %eax, %ecx
1522; SSE-NEXT:    andl $1, %ecx
1523; SSE-NEXT:    testq %rax, %rax
1524; SSE-NEXT:    js .LBB45_10
1525; SSE-NEXT:  # BB#11:
1526; SSE-NEXT:    xorps %xmm0, %xmm0
1527; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
1528; SSE-NEXT:    jmp .LBB45_12
1529; SSE-NEXT:  .LBB45_10:
1530; SSE-NEXT:    shrq %rax
1531; SSE-NEXT:    orq %rax, %rcx
1532; SSE-NEXT:    xorps %xmm0, %xmm0
1533; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
1534; SSE-NEXT:    addss %xmm0, %xmm0
1535; SSE-NEXT:  .LBB45_12:
1536; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1537; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1538; SSE-NEXT:    movaps %xmm2, %xmm0
1539; SSE-NEXT:    retq
1540;
1541; AVX1-LABEL: uitofp_4i64_to_4f32:
1542; AVX1:       # BB#0:
1543; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
1544; AVX1-NEXT:    movl %eax, %ecx
1545; AVX1-NEXT:    andl $1, %ecx
1546; AVX1-NEXT:    testq %rax, %rax
1547; AVX1-NEXT:    js .LBB45_1
1548; AVX1-NEXT:  # BB#2:
1549; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
1550; AVX1-NEXT:    jmp .LBB45_3
1551; AVX1-NEXT:  .LBB45_1:
1552; AVX1-NEXT:    shrq %rax
1553; AVX1-NEXT:    orq %rax, %rcx
1554; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
1555; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
1556; AVX1-NEXT:  .LBB45_3:
1557; AVX1-NEXT:    vmovq %xmm0, %rax
1558; AVX1-NEXT:    movl %eax, %ecx
1559; AVX1-NEXT:    andl $1, %ecx
1560; AVX1-NEXT:    testq %rax, %rax
1561; AVX1-NEXT:    js .LBB45_4
1562; AVX1-NEXT:  # BB#5:
1563; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
1564; AVX1-NEXT:    jmp .LBB45_6
1565; AVX1-NEXT:  .LBB45_4:
1566; AVX1-NEXT:    shrq %rax
1567; AVX1-NEXT:    orq %rax, %rcx
1568; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
1569; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
1570; AVX1-NEXT:  .LBB45_6:
1571; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1572; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1573; AVX1-NEXT:    vmovq %xmm0, %rax
1574; AVX1-NEXT:    movl %eax, %ecx
1575; AVX1-NEXT:    andl $1, %ecx
1576; AVX1-NEXT:    testq %rax, %rax
1577; AVX1-NEXT:    js .LBB45_7
1578; AVX1-NEXT:  # BB#8:
1579; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
1580; AVX1-NEXT:    jmp .LBB45_9
1581; AVX1-NEXT:  .LBB45_7:
1582; AVX1-NEXT:    shrq %rax
1583; AVX1-NEXT:    orq %rax, %rcx
1584; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
1585; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
1586; AVX1-NEXT:  .LBB45_9:
1587; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1588; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
1589; AVX1-NEXT:    movl %eax, %ecx
1590; AVX1-NEXT:    andl $1, %ecx
1591; AVX1-NEXT:    testq %rax, %rax
1592; AVX1-NEXT:    js .LBB45_10
1593; AVX1-NEXT:  # BB#11:
1594; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1595; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
1596; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1597; AVX1-NEXT:    vzeroupper
1598; AVX1-NEXT:    retq
1599; AVX1-NEXT:  .LBB45_10:
1600; AVX1-NEXT:    shrq %rax
1601; AVX1-NEXT:    orq %rax, %rcx
1602; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1603; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
1604; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
1605; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1606; AVX1-NEXT:    vzeroupper
1607; AVX1-NEXT:    retq
1608;
1609; AVX2-LABEL: uitofp_4i64_to_4f32:
1610; AVX2:       # BB#0:
1611; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
1612; AVX2-NEXT:    movl %eax, %ecx
1613; AVX2-NEXT:    andl $1, %ecx
1614; AVX2-NEXT:    testq %rax, %rax
1615; AVX2-NEXT:    js .LBB45_1
1616; AVX2-NEXT:  # BB#2:
1617; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
1618; AVX2-NEXT:    jmp .LBB45_3
1619; AVX2-NEXT:  .LBB45_1:
1620; AVX2-NEXT:    shrq %rax
1621; AVX2-NEXT:    orq %rax, %rcx
1622; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
1623; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
1624; AVX2-NEXT:  .LBB45_3:
1625; AVX2-NEXT:    vmovq %xmm0, %rax
1626; AVX2-NEXT:    movl %eax, %ecx
1627; AVX2-NEXT:    andl $1, %ecx
1628; AVX2-NEXT:    testq %rax, %rax
1629; AVX2-NEXT:    js .LBB45_4
1630; AVX2-NEXT:  # BB#5:
1631; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
1632; AVX2-NEXT:    jmp .LBB45_6
1633; AVX2-NEXT:  .LBB45_4:
1634; AVX2-NEXT:    shrq %rax
1635; AVX2-NEXT:    orq %rax, %rcx
1636; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
1637; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
1638; AVX2-NEXT:  .LBB45_6:
1639; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1640; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1641; AVX2-NEXT:    vmovq %xmm0, %rax
1642; AVX2-NEXT:    movl %eax, %ecx
1643; AVX2-NEXT:    andl $1, %ecx
1644; AVX2-NEXT:    testq %rax, %rax
1645; AVX2-NEXT:    js .LBB45_7
1646; AVX2-NEXT:  # BB#8:
1647; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
1648; AVX2-NEXT:    jmp .LBB45_9
1649; AVX2-NEXT:  .LBB45_7:
1650; AVX2-NEXT:    shrq %rax
1651; AVX2-NEXT:    orq %rax, %rcx
1652; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
1653; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
1654; AVX2-NEXT:  .LBB45_9:
1655; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1656; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
1657; AVX2-NEXT:    movl %eax, %ecx
1658; AVX2-NEXT:    andl $1, %ecx
1659; AVX2-NEXT:    testq %rax, %rax
1660; AVX2-NEXT:    js .LBB45_10
1661; AVX2-NEXT:  # BB#11:
1662; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1663; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
1664; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1665; AVX2-NEXT:    vzeroupper
1666; AVX2-NEXT:    retq
1667; AVX2-NEXT:  .LBB45_10:
1668; AVX2-NEXT:    shrq %rax
1669; AVX2-NEXT:    orq %rax, %rcx
1670; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1671; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
1672; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
1673; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1674; AVX2-NEXT:    vzeroupper
1675; AVX2-NEXT:    retq
1676  %cvt = uitofp <4 x i64> %a to <4 x float>
1677  ret <4 x float> %cvt
1678}
1679
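; In the SSE path below the magic constants of the 16-bit-halves trick are
; visible in the clear (editor's decode):
;   1258291200 = 0x4B000000, the bit pattern of 2^23 = 8388608.0f
;   1392508928 = 0x53000000, the bit pattern of 2^39
;   -5.497642e+11 ~= -(2^39 + 2^23) = -549764202496.0
; so the two addps reconstruct hi16*65536 + lo16 exactly.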
1680define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
1681; SSE-LABEL: uitofp_8i32_to_8f32:
1682; SSE:       # BB#0:
1683; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1684; SSE-NEXT:    movdqa %xmm0, %xmm3
1685; SSE-NEXT:    pand %xmm2, %xmm3
1686; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
1687; SSE-NEXT:    por %xmm4, %xmm3
1688; SSE-NEXT:    psrld $16, %xmm0
1689; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
1690; SSE-NEXT:    por %xmm5, %xmm0
1691; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
1692; SSE-NEXT:    addps %xmm6, %xmm0
1693; SSE-NEXT:    addps %xmm3, %xmm0
1694; SSE-NEXT:    pand %xmm1, %xmm2
1695; SSE-NEXT:    por %xmm4, %xmm2
1696; SSE-NEXT:    psrld $16, %xmm1
1697; SSE-NEXT:    por %xmm5, %xmm1
1698; SSE-NEXT:    addps %xmm6, %xmm1
1699; SSE-NEXT:    addps %xmm2, %xmm1
1700; SSE-NEXT:    retq
1701;
1702; AVX1-LABEL: uitofp_8i32_to_8f32:
1703; AVX1:       # BB#0:
1704; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm1
1705; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
1706; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
1707; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1708; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
1709; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
1710; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1711; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1712; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
1713; AVX1-NEXT:    retq
1714;
1715; AVX2-LABEL: uitofp_8i32_to_8f32:
1716; AVX2:       # BB#0:
1717; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
1718; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1719; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
1720; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
1721; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1722; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
1723; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
1724; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
1725; AVX2-NEXT:    retq
1726  %cvt = uitofp <8 x i32> %a to <8 x float>
1727  ret <8 x float> %cvt
1728}
1729
1730define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
1731; SSE-LABEL: uitofp_8i16_to_8f32:
1732; SSE:       # BB#0:
1733; SSE-NEXT:    pxor %xmm1, %xmm1
1734; SSE-NEXT:    movdqa %xmm0, %xmm2
1735; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1736; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
1737; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1738; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
1739; SSE-NEXT:    movaps %xmm2, %xmm0
1740; SSE-NEXT:    retq
1741;
1742; AVX1-LABEL: uitofp_8i16_to_8f32:
1743; AVX1:       # BB#0:
1744; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1745; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1746; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1747; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1748; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1749; AVX1-NEXT:    retq
1750;
1751; AVX2-LABEL: uitofp_8i16_to_8f32:
1752; AVX2:       # BB#0:
1753; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1754; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1755; AVX2-NEXT:    retq
1756  %cvt = uitofp <8 x i16> %a to <8 x float>
1757  ret <8 x float> %cvt
1758}
1759
1760define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
1761; SSE-LABEL: uitofp_8i8_to_8f32:
1762; SSE:       # BB#0:
1763; SSE-NEXT:    pxor %xmm1, %xmm1
1764; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1765; SSE-NEXT:    movdqa %xmm0, %xmm2
1766; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1767; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
1768; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1769; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
1770; SSE-NEXT:    movaps %xmm2, %xmm0
1771; SSE-NEXT:    retq
1772;
1773; AVX1-LABEL: uitofp_8i8_to_8f32:
1774; AVX1:       # BB#0:
1775; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1776; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1777; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1778; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1779; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1780; AVX1-NEXT:    retq
1781;
1782; AVX2-LABEL: uitofp_8i8_to_8f32:
1783; AVX2:       # BB#0:
1784; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1785; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1786; AVX2-NEXT:    retq
1787  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1788  %cvt = uitofp <8 x i8> %shuf to <8 x float>
1789  ret <8 x float> %cvt
1790}
1791
1792define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
1793; SSE-LABEL: uitofp_16i8_to_8f32:
1794; SSE:       # BB#0:
1795; SSE-NEXT:    pxor %xmm1, %xmm1
1796; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1797; SSE-NEXT:    movdqa %xmm0, %xmm2
1798; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1799; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
1800; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1801; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
1802; SSE-NEXT:    movaps %xmm2, %xmm0
1803; SSE-NEXT:    retq
1804;
1805; AVX1-LABEL: uitofp_16i8_to_8f32:
1806; AVX1:       # BB#0:
1807; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1808; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1809; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1810; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1811; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
1812; AVX1-NEXT:    retq
1813;
1814; AVX2-LABEL: uitofp_16i8_to_8f32:
1815; AVX2:       # BB#0:
1816; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1817; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1818; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
1819; AVX2-NEXT:    retq
1820  %cvt = uitofp <16 x i8> %a to <16 x float>
1821  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1822  ret <8 x float> %shuf
1823}
1824
1825;
1826; Load Signed Integer to Double
1827;
1828
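; SSE2 and AVX/AVX2 have no packed i64 -> f64 convert, so each element is
; moved to a GPR (movd/vmovq/vpextrq), converted with the scalar cvtsi2sdq,
; and the doubles are packed back together with unpcklpd/vinsertf128.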
1829define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
1830; SSE-LABEL: sitofp_load_2i64_to_2f64:
1831; SSE:       # BB#0:
1832; SSE-NEXT:    movdqa (%rdi), %xmm1
1833; SSE-NEXT:    movd %xmm1, %rax
1834; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
1835; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1836; SSE-NEXT:    movd %xmm1, %rax
1837; SSE-NEXT:    xorps %xmm1, %xmm1
1838; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
1839; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1840; SSE-NEXT:    retq
1841;
1842; AVX-LABEL: sitofp_load_2i64_to_2f64:
1843; AVX:       # BB#0:
1844; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1845; AVX-NEXT:    vpextrq $1, %xmm0, %rax
1846; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
1847; AVX-NEXT:    vmovq %xmm0, %rax
1848; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1849; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
1850; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1851; AVX-NEXT:    retq
1852  %ld = load <2 x i64>, <2 x i64> *%a
1853  %cvt = sitofp <2 x i64> %ld to <2 x double>
1854  ret <2 x double> %cvt
1855}
1856
1857define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
1858; SSE-LABEL: sitofp_load_2i32_to_2f64:
1859; SSE:       # BB#0:
1860; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
1861; SSE-NEXT:    retq
1862;
1863; AVX-LABEL: sitofp_load_2i32_to_2f64:
1864; AVX:       # BB#0:
1865; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
1866; AVX-NEXT:    retq
1867  %ld = load <2 x i32>, <2 x i32> *%a
1868  %cvt = sitofp <2 x i32> %ld to <2 x double>
1869  ret <2 x double> %cvt
1870}
1871
1872define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
1873; SSE-LABEL: sitofp_load_2i16_to_2f64:
1874; SSE:       # BB#0:
1875; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1876; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1877; SSE-NEXT:    psrad $16, %xmm0
1878; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
1879; SSE-NEXT:    retq
1880;
1881; AVX-LABEL: sitofp_load_2i16_to_2f64:
1882; AVX:       # BB#0:
1883; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
1884; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1885; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
1886; AVX-NEXT:    retq
1887  %ld = load <2 x i16>, <2 x i16> *%a
1888  %cvt = sitofp <2 x i16> %ld to <2 x double>
1889  ret <2 x double> %cvt
1890}
1891
1892define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
1893; SSE-LABEL: sitofp_load_2i8_to_2f64:
1894; SSE:       # BB#0:
1895; SSE-NEXT:    movzwl (%rdi), %eax
1896; SSE-NEXT:    movd %eax, %xmm0
1897; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1898; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1899; SSE-NEXT:    psrad $24, %xmm0
1900; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
1901; SSE-NEXT:    retq
1902;
1903; AVX-LABEL: sitofp_load_2i8_to_2f64:
1904; AVX:       # BB#0:
1905; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
1906; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1907; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
1908; AVX-NEXT:    retq
1909  %ld = load <2 x i8>, <2 x i8> *%a
1910  %cvt = sitofp <2 x i8> %ld to <2 x double>
1911  ret <2 x double> %cvt
1912}
1913
1914define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
1915; SSE-LABEL: sitofp_load_4i64_to_4f64:
1916; SSE:       # BB#0:
1917; SSE-NEXT:    movdqa (%rdi), %xmm1
1918; SSE-NEXT:    movdqa 16(%rdi), %xmm2
1919; SSE-NEXT:    movd %xmm1, %rax
1920; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
1921; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1922; SSE-NEXT:    movd %xmm1, %rax
1923; SSE-NEXT:    xorps %xmm1, %xmm1
1924; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
1925; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1926; SSE-NEXT:    movd %xmm2, %rax
1927; SSE-NEXT:    xorps %xmm1, %xmm1
1928; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
1929; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1930; SSE-NEXT:    movd %xmm2, %rax
1931; SSE-NEXT:    xorps %xmm2, %xmm2
1932; SSE-NEXT:    cvtsi2sdq %rax, %xmm2
1933; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1934; SSE-NEXT:    retq
1935;
1936; AVX1-LABEL: sitofp_load_4i64_to_4f64:
1937; AVX1:       # BB#0:
1938; AVX1-NEXT:    vmovaps (%rdi), %ymm0
1939; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1940; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
1941; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
1942; AVX1-NEXT:    vmovq %xmm1, %rax
1943; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
1944; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1945; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
1946; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
1947; AVX1-NEXT:    vmovq %xmm0, %rax
1948; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1949; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
1950; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1951; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1952; AVX1-NEXT:    retq
1953;
1954; AVX2-LABEL: sitofp_load_4i64_to_4f64:
1955; AVX2:       # BB#0:
1956; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1957; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1958; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
1959; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
1960; AVX2-NEXT:    vmovq %xmm1, %rax
1961; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
1962; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1963; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
1964; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
1965; AVX2-NEXT:    vmovq %xmm0, %rax
1966; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1967; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
1968; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1969; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1970; AVX2-NEXT:    retq
1971  %ld = load <4 x i64>, <4 x i64> *%a
1972  %cvt = sitofp <4 x i64> %ld to <4 x double>
1973  ret <4 x double> %cvt
1974}
1975
1976define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
1977; SSE-LABEL: sitofp_load_4i32_to_4f64:
1978; SSE:       # BB#0:
1979; SSE-NEXT:    movdqa (%rdi), %xmm1
1980; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
1981; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1982; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
1983; SSE-NEXT:    retq
1984;
1985; AVX-LABEL: sitofp_load_4i32_to_4f64:
1986; AVX:       # BB#0:
1987; AVX-NEXT:    vcvtdq2pd (%rdi), %ymm0
1988; AVX-NEXT:    retq
1989  %ld = load <4 x i32>, <4 x i32> *%a
1990  %cvt = sitofp <4 x i32> %ld to <4 x double>
1991  ret <4 x double> %cvt
1992}
1993
1994define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
1995; SSE-LABEL: sitofp_load_4i16_to_4f64:
1996; SSE:       # BB#0:
1997; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1998; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1999; SSE-NEXT:    psrad $16, %xmm1
2000; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
2001; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2002; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
2003; SSE-NEXT:    retq
2004;
2005; AVX-LABEL: sitofp_load_4i16_to_4f64:
2006; AVX:       # BB#0:
2007; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
2008; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
2009; AVX-NEXT:    retq
2010  %ld = load <4 x i16>, <4 x i16> *%a
2011  %cvt = sitofp <4 x i16> %ld to <4 x double>
2012  ret <4 x double> %cvt
2013}
2014
2015define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
2016; SSE-LABEL: sitofp_load_4i8_to_4f64:
2017; SSE:       # BB#0:
2018; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2019; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2020; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2021; SSE-NEXT:    psrad $24, %xmm1
2022; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
2023; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2024; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
2025; SSE-NEXT:    retq
2026;
2027; AVX-LABEL: sitofp_load_4i8_to_4f64:
2028; AVX:       # BB#0:
2029; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
2030; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
2031; AVX-NEXT:    retq
2032  %ld = load <4 x i8>, <4 x i8> *%a
2033  %cvt = sitofp <4 x i8> %ld to <4 x double>
2034  ret <4 x double> %cvt
2035}
2036
2037;
2038; Load Unsigned Integer to Double
2039;
2040
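; The u64 -> f64 lowering below uses the classic 2^52/2^84 bias trick
; (editor's decode of the constants):
;   1127219200 = 0x43300000, the high word of the double 2^52
;   1160773632 = 0x45300000, the high word of the double 2^84
; punpckldq pairs the low and high 32-bit halves of x with those words,
; producing the doubles (2^52 + lo32) and (2^84 + hi32 * 2^32) exactly.
; Subtracting [4.503600e+15, 1.934281e+25] = [2^52, 2^84] strips the biases,
; and the horizontal add (addpd/vhaddpd) sums the two pieces:
;
;   (double)x == ((2^52 + lo32) - 2^52) + ((2^84 + hi32*2^32) - 2^84)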
2041define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
2042; SSE-LABEL: uitofp_load_2i64_to_2f64:
2043; SSE:       # BB#0:
2044; SSE-NEXT:    movdqa (%rdi), %xmm1
2045; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2046; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
2047; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2048; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2049; SSE-NEXT:    subpd %xmm4, %xmm1
2050; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
2051; SSE-NEXT:    addpd %xmm1, %xmm0
2052; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2053; SSE-NEXT:    subpd %xmm4, %xmm3
2054; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
2055; SSE-NEXT:    addpd %xmm3, %xmm1
2056; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2057; SSE-NEXT:    retq
2058;
2059; AVX-LABEL: uitofp_load_2i64_to_2f64:
2060; AVX:       # BB#0:
2061; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2062; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
2063; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2064; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
2065; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
2066; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
2067; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2068; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2069; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
2070; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
2071; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2072; AVX-NEXT:    retq
2073  %ld = load <2 x i64>, <2 x i64> *%a
2074  %cvt = uitofp <2 x i64> %ld to <2 x double>
2075  ret <2 x double> %cvt
2076}
2077
2078define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
2079; SSE-LABEL: uitofp_load_2i32_to_2f64:
2080; SSE:       # BB#0:
2081; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
2082; SSE-NEXT:    pxor %xmm0, %xmm0
2083; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2084; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2085; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
2086; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2087; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2088; SSE-NEXT:    subpd %xmm4, %xmm1
2089; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
2090; SSE-NEXT:    addpd %xmm1, %xmm0
2091; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2092; SSE-NEXT:    subpd %xmm4, %xmm3
2093; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
2094; SSE-NEXT:    addpd %xmm3, %xmm1
2095; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2096; SSE-NEXT:    retq
2097;
2098; AVX-LABEL: uitofp_load_2i32_to_2f64:
2099; AVX:       # BB#0:
2100; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
2101; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
2102; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2103; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
2104; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
2105; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
2106; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2107; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2108; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
2109; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
2110; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2111; AVX-NEXT:    retq
2112  %ld = load <2 x i32>, <2 x i32> *%a
2113  %cvt = uitofp <2 x i32> %ld to <2 x double>
2114  ret <2 x double> %cvt
2115}
2116
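; Loaded u16/u8 -> f64: zero extension keeps the value in i32's non-negative
; range, so the signed cvtdq2pd is exact, as in the float cases earlier.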
2117define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
2118; SSE-LABEL: uitofp_load_2i16_to_2f64:
2119; SSE:       # BB#0:
2120; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2121; SSE-NEXT:    pxor %xmm1, %xmm1
2122; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2123; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
2124; SSE-NEXT:    retq
2125;
2126; AVX-LABEL: uitofp_load_2i16_to_2f64:
2127; AVX:       # BB#0:
2128; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2129; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2130; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
2131; AVX-NEXT:    retq
2132  %ld = load <2 x i16>, <2 x i16> *%a
2133  %cvt = uitofp <2 x i16> %ld to <2 x double>
2134  ret <2 x double> %cvt
2135}
2136
2137define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
2138; SSE-LABEL: uitofp_load_2i8_to_2f64:
2139; SSE:       # BB#0:
2140; SSE-NEXT:    movzwl (%rdi), %eax
2141; SSE-NEXT:    movd %eax, %xmm0
2142; SSE-NEXT:    pxor %xmm1, %xmm1
2143; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2144; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2145; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
2146; SSE-NEXT:    retq
2147;
2148; AVX-LABEL: uitofp_load_2i8_to_2f64:
2149; AVX:       # BB#0:
2150; AVX-NEXT:    movzwl (%rdi), %eax
2151; AVX-NEXT:    vmovd %eax, %xmm0
2152; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2153; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
2154; AVX-NEXT:    retq
2155  %ld = load <2 x i8>, <2 x i8> *%a
2156  %cvt = uitofp <2 x i8> %ld to <2 x double>
2157  ret <2 x double> %cvt
2158}
2159
2160define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
2161; SSE-LABEL: uitofp_load_4i64_to_4f64:
2162; SSE:       # BB#0:
2163; SSE-NEXT:    movdqa (%rdi), %xmm1
2164; SSE-NEXT:    movdqa 16(%rdi), %xmm2
2165; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
2166; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
2167; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2168; SSE-NEXT:    movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
2169; SSE-NEXT:    subpd %xmm5, %xmm1
2170; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
2171; SSE-NEXT:    addpd %xmm1, %xmm0
2172; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2173; SSE-NEXT:    subpd %xmm5, %xmm4
2174; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
2175; SSE-NEXT:    addpd %xmm4, %xmm1
2176; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2177; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
2178; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2179; SSE-NEXT:    subpd %xmm5, %xmm2
2180; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
2181; SSE-NEXT:    addpd %xmm2, %xmm1
2182; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2183; SSE-NEXT:    subpd %xmm5, %xmm4
2184; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
2185; SSE-NEXT:    addpd %xmm4, %xmm2
2186; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2187; SSE-NEXT:    retq
2188;
2189; AVX1-LABEL: uitofp_load_4i64_to_4f64:
2190; AVX1:       # BB#0:
2191; AVX1-NEXT:    vmovaps (%rdi), %ymm0
2192; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2193; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2194; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2195; AVX1-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2196; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
2197; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
2198; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2199; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2200; AVX1-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
2201; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
2202; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
2203; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2204; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
2205; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
2206; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2207; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2208; AVX1-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
2209; AVX1-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
2210; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
2211; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2212; AVX1-NEXT:    retq
2213;
2214; AVX2-LABEL: uitofp_load_4i64_to_4f64:
2215; AVX2:       # BB#0:
2216; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2217; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2218; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2219; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2220; AVX2-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2221; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
2222; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
2223; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2224; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2225; AVX2-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
2226; AVX2-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
2227; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
2228; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2229; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
2230; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
2231; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2232; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2233; AVX2-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
2234; AVX2-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
2235; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
2236; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2237; AVX2-NEXT:    retq
2238  %ld = load <4 x i64>, <4 x i64> *%a
2239  %cvt = uitofp <4 x i64> %ld to <4 x double>
2240  ret <4 x double> %cvt
2241}
2242
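; For u32 -> f64 a double holds each 16-bit half exactly, so the AVX paths
; below split every element as x = hi16 * 65536 + lo16, convert both halves
; with the signed cvtdq2pd (both are non-negative), scale the high half by
; 65536.0 (vmulpd), and add. The SSE2 path instead zero-extends each element
; to 64 bits and reuses the 2^52/2^84 bias trick from the i64 case above.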
2243define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
2244; SSE-LABEL: uitofp_load_4i32_to_4f64:
2245; SSE:       # BB#0:
2246; SSE-NEXT:    movdqa (%rdi), %xmm2
2247; SSE-NEXT:    pxor %xmm1, %xmm1
2248; SSE-NEXT:    movdqa %xmm2, %xmm3
2249; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2250; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0]
2251; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
2252; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2253; SSE-NEXT:    movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25]
2254; SSE-NEXT:    subpd %xmm6, %xmm3
2255; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
2256; SSE-NEXT:    addpd %xmm3, %xmm0
2257; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2258; SSE-NEXT:    subpd %xmm6, %xmm5
2259; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
2260; SSE-NEXT:    addpd %xmm5, %xmm3
2261; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
2262; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2263; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
2264; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2265; SSE-NEXT:    subpd %xmm6, %xmm2
2266; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
2267; SSE-NEXT:    addpd %xmm2, %xmm1
2268; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2269; SSE-NEXT:    subpd %xmm6, %xmm3
2270; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
2271; SSE-NEXT:    addpd %xmm3, %xmm2
2272; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2273; SSE-NEXT:    retq
2274;
2275; AVX1-LABEL: uitofp_load_4i32_to_4f64:
2276; AVX1:       # BB#0:
2277; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
2278; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
2279; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
2280; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
2281; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
2282; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
2283; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
2284; AVX1-NEXT:    retq
2285;
2286; AVX2-LABEL: uitofp_load_4i32_to_4f64:
2287; AVX2:       # BB#0:
2288; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2289; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
2290; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
2291; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
2292; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
2293; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
2294; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
2295; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
2296; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
2297; AVX2-NEXT:    retq
2298  %ld = load <4 x i32>, <4 x i32> *%a
2299  %cvt = uitofp <4 x i32> %ld to <4 x double>
2300  ret <4 x double> %cvt
2301}
2302
2303define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
2304; SSE-LABEL: uitofp_load_4i16_to_4f64:
2305; SSE:       # BB#0:
2306; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
2307; SSE-NEXT:    pxor %xmm0, %xmm0
2308; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2309; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
2310; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2311; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
2312; SSE-NEXT:    retq
2313;
2314; AVX-LABEL: uitofp_load_4i16_to_4f64:
2315; AVX:       # BB#0:
2316; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2317; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
2318; AVX-NEXT:    retq
2319  %ld = load <4 x i16>, <4 x i16> *%a
2320  %cvt = uitofp <4 x i16> %ld to <4 x double>
2321  ret <4 x double> %cvt
2322}
2323
2324define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
2325; SSE-LABEL: uitofp_load_4i8_to_4f64:
2326; SSE:       # BB#0:
2327; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2328; SSE-NEXT:    pxor %xmm0, %xmm0
2329; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2330; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2331; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
2332; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2333; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
2334; SSE-NEXT:    retq
2335;
2336; AVX-LABEL: uitofp_load_4i8_to_4f64:
2337; AVX:       # BB#0:
2338; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2339; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
2340; AVX-NEXT:    retq
2341  %ld = load <4 x i8>, <4 x i8> *%a
2342  %cvt = uitofp <4 x i8> %ld to <4 x double>
2343  ret <4 x double> %cvt
2344}
2345
2346;
2347; Load Signed Integer to Float
2348;
2349
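; As with the in-register variants, i64 -> f32 scalarizes through cvtsi2ssq
; and reassembles the vector with unpcklps/vinsertps; the narrower loads get
; by with a single sign-extending load (vpmovsx*) plus cvtdq2ps.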
2350define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
2351; SSE-LABEL: sitofp_load_4i64_to_4f32:
2352; SSE:       # BB#0:
2353; SSE-NEXT:    movdqa (%rdi), %xmm1
2354; SSE-NEXT:    movdqa 16(%rdi), %xmm2
2355; SSE-NEXT:    movd %xmm2, %rax
2356; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
2357; SSE-NEXT:    movd %xmm1, %rax
2358; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
2359; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2360; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2361; SSE-NEXT:    movd %xmm2, %rax
2362; SSE-NEXT:    xorps %xmm2, %xmm2
2363; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
2364; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2365; SSE-NEXT:    movd %xmm1, %rax
2366; SSE-NEXT:    xorps %xmm1, %xmm1
2367; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
2368; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2369; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2370; SSE-NEXT:    retq
2371;
2372; AVX1-LABEL: sitofp_load_4i64_to_4f32:
2373; AVX1:       # BB#0:
2374; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
2375; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
2376; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
2377; AVX1-NEXT:    vmovq %xmm0, %rax
2378; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2379; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2380; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2381; AVX1-NEXT:    vmovq %xmm0, %rax
2382; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2383; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2384; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
2385; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2386; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
2387; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2388; AVX1-NEXT:    vzeroupper
2389; AVX1-NEXT:    retq
2390;
2391; AVX2-LABEL: sitofp_load_4i64_to_4f32:
2392; AVX2:       # BB#0:
2393; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2394; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
2395; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
2396; AVX2-NEXT:    vmovq %xmm0, %rax
2397; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2398; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2399; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2400; AVX2-NEXT:    vmovq %xmm0, %rax
2401; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2402; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2403; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
2404; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2405; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
2406; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2407; AVX2-NEXT:    vzeroupper
2408; AVX2-NEXT:    retq
2409  %ld = load <4 x i64>, <4 x i64> *%a
2410  %cvt = sitofp <4 x i64> %ld to <4 x float>
2411  ret <4 x float> %cvt
2412}
2413
2414define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
2415; SSE-LABEL: sitofp_load_4i32_to_4f32:
2416; SSE:       # BB#0:
2417; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
2418; SSE-NEXT:    retq
2419;
2420; AVX-LABEL: sitofp_load_4i32_to_4f32:
2421; AVX:       # BB#0:
2422; AVX-NEXT:    vcvtdq2ps (%rdi), %xmm0
2423; AVX-NEXT:    retq
2424  %ld = load <4 x i32>, <4 x i32> *%a
2425  %cvt = sitofp <4 x i32> %ld to <4 x float>
2426  ret <4 x float> %cvt
2427}
2428
2429define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
2430; SSE-LABEL: sitofp_load_4i16_to_4f32:
2431; SSE:       # BB#0:
2432; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2433; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2434; SSE-NEXT:    psrad $16, %xmm0
2435; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
2436; SSE-NEXT:    retq
2437;
2438; AVX-LABEL: sitofp_load_4i16_to_4f32:
2439; AVX:       # BB#0:
2440; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
2441; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
2442; AVX-NEXT:    retq
2443  %ld = load <4 x i16>, <4 x i16> *%a
2444  %cvt = sitofp <4 x i16> %ld to <4 x float>
2445  ret <4 x float> %cvt
2446}
2447
2448define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
2449; SSE-LABEL: sitofp_load_4i8_to_4f32:
2450; SSE:       # BB#0:
2451; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2452; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2453; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2454; SSE-NEXT:    psrad $24, %xmm0
2455; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
2456; SSE-NEXT:    retq
2457;
2458; AVX-LABEL: sitofp_load_4i8_to_4f32:
2459; AVX:       # BB#0:
2460; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
2461; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
2462; AVX-NEXT:    retq
2463  %ld = load <4 x i8>, <4 x i8> *%a
2464  %cvt = sitofp <4 x i8> %ld to <4 x float>
2465  ret <4 x float> %cvt
2466}
2467
2468define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
2469; SSE-LABEL: sitofp_load_8i64_to_8f32:
2470; SSE:       # BB#0:
2471; SSE-NEXT:    movdqa (%rdi), %xmm1
2472; SSE-NEXT:    movdqa 16(%rdi), %xmm2
2473; SSE-NEXT:    movdqa 32(%rdi), %xmm3
2474; SSE-NEXT:    movdqa 48(%rdi), %xmm4
2475; SSE-NEXT:    movd %xmm2, %rax
2476; SSE-NEXT:    cvtsi2ssq %rax, %xmm5
2477; SSE-NEXT:    movd %xmm1, %rax
2478; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
2479; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
2480; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2481; SSE-NEXT:    movd %xmm2, %rax
2482; SSE-NEXT:    xorps %xmm2, %xmm2
2483; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
2484; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2485; SSE-NEXT:    movd %xmm1, %rax
2486; SSE-NEXT:    xorps %xmm1, %xmm1
2487; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
2488; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2489; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2490; SSE-NEXT:    movd %xmm4, %rax
2491; SSE-NEXT:    xorps %xmm2, %xmm2
2492; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
2493; SSE-NEXT:    movd %xmm3, %rax
2494; SSE-NEXT:    xorps %xmm1, %xmm1
2495; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
2496; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2497; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
2498; SSE-NEXT:    movd %xmm2, %rax
2499; SSE-NEXT:    xorps %xmm2, %xmm2
2500; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
2501; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2502; SSE-NEXT:    movd %xmm3, %rax
2503; SSE-NEXT:    xorps %xmm3, %xmm3
2504; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
2505; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2506; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2507; SSE-NEXT:    retq
2508;
2509; AVX1-LABEL: sitofp_load_8i64_to_8f32:
2510; AVX1:       # BB#0:
2511; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
2512; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
2513; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
2514; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2515; AVX1-NEXT:    vmovq %xmm1, %rax
2516; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2517; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2518; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2519; AVX1-NEXT:    vmovq %xmm1, %rax
2520; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2521; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2522; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
2523; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
2524; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2525; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
2526; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2527; AVX1-NEXT:    vmovq %xmm0, %rax
2528; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2529; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2530; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2531; AVX1-NEXT:    vmovq %xmm0, %rax
2532; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2533; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2534; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
2535; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2536; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
2537; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
2538; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2539; AVX1-NEXT:    retq
2540;
2541; AVX2-LABEL: sitofp_load_8i64_to_8f32:
2542; AVX2:       # BB#0:
2543; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2544; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
2545; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
2546; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2547; AVX2-NEXT:    vmovq %xmm1, %rax
2548; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2549; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2550; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
2551; AVX2-NEXT:    vmovq %xmm1, %rax
2552; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2553; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2554; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
2555; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
2556; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2557; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
2558; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
2559; AVX2-NEXT:    vmovq %xmm0, %rax
2560; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2561; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2562; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2563; AVX2-NEXT:    vmovq %xmm0, %rax
2564; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
2565; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2566; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
2567; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2568; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
2569; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
2570; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2571; AVX2-NEXT:    retq
2572  %ld = load <8 x i64>, <8 x i64> *%a
2573  %cvt = sitofp <8 x i64> %ld to <8 x float>
2574  ret <8 x float> %cvt
2575}
2576
2577define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
2578; SSE-LABEL: sitofp_load_8i32_to_8f32:
2579; SSE:       # BB#0:
2580; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
2581; SSE-NEXT:    cvtdq2ps 16(%rdi), %xmm1
2582; SSE-NEXT:    retq
2583;
2584; AVX-LABEL: sitofp_load_8i32_to_8f32:
2585; AVX:       # BB#0:
2586; AVX-NEXT:    vcvtdq2ps (%rdi), %ymm0
2587; AVX-NEXT:    retq
2588  %ld = load <8 x i32>, <8 x i32> *%a
2589  %cvt = sitofp <8 x i32> %ld to <8 x float>
2590  ret <8 x float> %cvt
2591}
2592
2593define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
2594; SSE-LABEL: sitofp_load_8i16_to_8f32:
2595; SSE:       # BB#0:
2596; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2597; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2598; SSE-NEXT:    psrad $16, %xmm0
2599; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
2600; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
2601; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2602; SSE-NEXT:    psrad $16, %xmm1
2603; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
2604; SSE-NEXT:    retq
2605;
2606; AVX1-LABEL: sitofp_load_8i16_to_8f32:
2607; AVX1:       # BB#0:
2608; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
2609; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
2610; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2611; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2612; AVX1-NEXT:    retq
2613;
2614; AVX2-LABEL: sitofp_load_8i16_to_8f32:
2615; AVX2:       # BB#0:
2616; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
2617; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
2618; AVX2-NEXT:    retq
2619  %ld = load <8 x i16>, <8 x i16> *%a
2620  %cvt = sitofp <8 x i16> %ld to <8 x float>
2621  ret <8 x float> %cvt
2622}
2623
2624define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
2625; SSE-LABEL: sitofp_load_8i8_to_8f32:
2626; SSE:       # BB#0:
2627; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2628; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2629; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2630; SSE-NEXT:    psrad $24, %xmm0
2631; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
2632; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2633; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2634; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2635; SSE-NEXT:    psrad $24, %xmm1
2636; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
2637; SSE-NEXT:    retq
2638;
2639; AVX1-LABEL: sitofp_load_8i8_to_8f32:
2640; AVX1:       # BB#0:
2641; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
2642; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
2643; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2644; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
2645; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2646; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
2647; AVX1-NEXT:    retq
2648;
2649; AVX2-LABEL: sitofp_load_8i8_to_8f32:
2650; AVX2:       # BB#0:
2651; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
2652; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
2653; AVX2-NEXT:    retq
2654  %ld = load <8 x i8>, <8 x i8> *%a
2655  %cvt = sitofp <8 x i8> %ld to <8 x float>
2656  ret <8 x float> %cvt
2657}
2658
2659;
2660; Load Unsigned Integer to Float
2661;
2662
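; The loaded u64 -> f32 cases repeat the branchy halve-and-double expansion
; from uitofp_4i64_to_4f32 above, once per lane.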
define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-LABEL: uitofp_load_4i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa 16(%rdi), %xmm3
; SSE-NEXT:    movd %xmm3, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB74_1
; SSE-NEXT:  # BB#2:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    jmp .LBB74_3
; SSE-NEXT:  .LBB74_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
; SSE-NEXT:    addss %xmm2, %xmm2
; SSE-NEXT:  .LBB74_3:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB74_4
; SSE-NEXT:  # BB#5:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB74_6
; SSE-NEXT:  .LBB74_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB74_6:
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE-NEXT:    movd %xmm3, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB74_7
; SSE-NEXT:  # BB#8:
; SSE-NEXT:    xorps %xmm3, %xmm3
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    jmp .LBB74_9
; SSE-NEXT:  .LBB74_7:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm3, %xmm3
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
; SSE-NEXT:    addss %xmm3, %xmm3
; SSE-NEXT:  .LBB74_9:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB74_10
; SSE-NEXT:  # BB#11:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB74_12
; SSE-NEXT:  .LBB74_10:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:  .LBB74_12:
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_4i64_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB74_1
; AVX1-NEXT:  # BB#2:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    jmp .LBB74_3
; AVX1-NEXT:  .LBB74_1:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:  .LBB74_3:
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB74_4
; AVX1-NEXT:  # BB#5:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB74_6
; AVX1-NEXT:  .LBB74_4:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT:  .LBB74_6:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB74_7
; AVX1-NEXT:  # BB#8:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB74_9
; AVX1-NEXT:  .LBB74_7:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT:  .LBB74_9:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB74_10
; AVX1-NEXT:  # BB#11:
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
; AVX1-NEXT:  .LBB74_10:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_4i64_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB74_1
; AVX2-NEXT:  # BB#2:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT:    jmp .LBB74_3
; AVX2-NEXT:  .LBB74_1:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT:  .LBB74_3:
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB74_4
; AVX2-NEXT:  # BB#5:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    jmp .LBB74_6
; AVX2-NEXT:  .LBB74_4:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT:  .LBB74_6:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB74_7
; AVX2-NEXT:  # BB#8:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    jmp .LBB74_9
; AVX2-NEXT:  .LBB74_7:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT:  .LBB74_9:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB74_10
; AVX2-NEXT:  # BB#11:
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
; AVX2-NEXT:  .LBB74_10:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %ld = load <4 x i64>, <4 x i64> *%a
  %cvt = uitofp <4 x i64> %ld to <4 x float>
  ret <4 x float> %cvt
}

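; Note: unsigned i32 -> f32 uses a magic-number trick instead of scalarizing.
; With lo = (x & 0xffff) | 0x4b000000 and hi = (x >> 16) | 0x53000000 (the
; constants 1258291200 and 1392508928 visible in the 8 x i32 version below),
; the bit patterns read as floats are (2^23 + lo16) and (2^39 + hi16 * 2^16),
; so adding -(2^39 + 2^23) (~ -5.497642e+11) to hi and then adding lo
; reconstructs the exact unsigned value.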
define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; SSE-LABEL: uitofp_load_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    por {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    por {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_4i32_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
  %ld = load <4 x i32>, <4 x i32> *%a
  %cvt = uitofp <4 x i32> %ld to <4 x float>
  ret <4 x float> %cvt
}

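; Note: zero-extended i16/i8 values fit comfortably in the non-negative signed
; i32 range, so uitofp from these narrow types lowers to a plain zero-extension
; followed by the ordinary signed cvtdq2ps, with no unsigned fixup needed.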
define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-LABEL: uitofp_load_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_load_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x i16>, <4 x i16> *%a
  %cvt = uitofp <4 x i16> %ld to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-LABEL: uitofp_load_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_load_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x i8>, <4 x i8> *%a
  %cvt = uitofp <4 x i8> %ld to <4 x float>
  ret <4 x float> %cvt
}

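; Note: the 8 x i64 case below simply repeats the scalar sign-test/convert
; diamond once per element: eight times across two 128-bit registers per half
; on SSE, and per extracted lane (vextractf128/vextracti128) on AVX.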
define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-LABEL: uitofp_load_8i64_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa 16(%rdi), %xmm5
; SSE-NEXT:    movdqa 32(%rdi), %xmm2
; SSE-NEXT:    movdqa 48(%rdi), %xmm3
; SSE-NEXT:    movd %xmm5, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_1
; SSE-NEXT:  # BB#2:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm4
; SSE-NEXT:    jmp .LBB78_3
; SSE-NEXT:  .LBB78_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm4
; SSE-NEXT:    addss %xmm4, %xmm4
; SSE-NEXT:  .LBB78_3:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_4
; SSE-NEXT:  # BB#5:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB78_6
; SSE-NEXT:  .LBB78_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB78_6:
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE-NEXT:    movd %xmm5, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_7
; SSE-NEXT:  # BB#8:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm6
; SSE-NEXT:    jmp .LBB78_9
; SSE-NEXT:  .LBB78_7:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm6
; SSE-NEXT:    addss %xmm6, %xmm6
; SSE-NEXT:  .LBB78_9:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_10
; SSE-NEXT:  # BB#11:
; SSE-NEXT:    xorps %xmm5, %xmm5
; SSE-NEXT:    cvtsi2ssq %rax, %xmm5
; SSE-NEXT:    jmp .LBB78_12
; SSE-NEXT:  .LBB78_10:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm5, %xmm5
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm5
; SSE-NEXT:    addss %xmm5, %xmm5
; SSE-NEXT:  .LBB78_12:
; SSE-NEXT:    movd %xmm3, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_13
; SSE-NEXT:  # BB#14:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm7
; SSE-NEXT:    jmp .LBB78_15
; SSE-NEXT:  .LBB78_13:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm7
; SSE-NEXT:    addss %xmm7, %xmm7
; SSE-NEXT:  .LBB78_15:
; SSE-NEXT:    movd %xmm2, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_16
; SSE-NEXT:  # BB#17:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB78_18
; SSE-NEXT:  .LBB78_16:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:  .LBB78_18:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE-NEXT:    movd %xmm3, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_19
; SSE-NEXT:  # BB#20:
; SSE-NEXT:    xorps %xmm3, %xmm3
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    jmp .LBB78_21
; SSE-NEXT:  .LBB78_19:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm3, %xmm3
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
; SSE-NEXT:    addss %xmm3, %xmm3
; SSE-NEXT:  .LBB78_21:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT:    movd %xmm2, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_22
; SSE-NEXT:  # BB#23:
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    jmp .LBB78_24
; SSE-NEXT:  .LBB78_22:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
; SSE-NEXT:    addss %xmm2, %xmm2
; SSE-NEXT:  .LBB78_24:
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_8i64_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_1
; AVX1-NEXT:  # BB#2:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    jmp .LBB78_3
; AVX1-NEXT:  .LBB78_1:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:  .LBB78_3:
; AVX1-NEXT:    vmovq %xmm2, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_4
; AVX1-NEXT:  # BB#5:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT:    jmp .LBB78_6
; AVX1-NEXT:  .LBB78_4:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX1-NEXT:    vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT:  .LBB78_6:
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vmovq %xmm2, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_7
; AVX1-NEXT:  # BB#8:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm4
; AVX1-NEXT:    jmp .LBB78_9
; AVX1-NEXT:  .LBB78_7:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm4
; AVX1-NEXT:    vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT:  .LBB78_9:
; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_10
; AVX1-NEXT:  # BB#11:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB78_12
; AVX1-NEXT:  .LBB78_10:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT:  .LBB78_12:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_13
; AVX1-NEXT:  # BB#14:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX1-NEXT:    jmp .LBB78_15
; AVX1-NEXT:  .LBB78_13:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm5
; AVX1-NEXT:    vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT:  .LBB78_15:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_16
; AVX1-NEXT:  # BB#17:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT:    jmp .LBB78_18
; AVX1-NEXT:  .LBB78_16:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX1-NEXT:    vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT:  .LBB78_18:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vmovq %xmm4, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_19
; AVX1-NEXT:  # BB#20:
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX1-NEXT:    jmp .LBB78_21
; AVX1-NEXT:  .LBB78_19:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm5
; AVX1-NEXT:  .LBB78_21:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB78_22
; AVX1-NEXT:  # BB#23:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB78_24
; AVX1-NEXT:  .LBB78_22:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT:  .LBB78_24:
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_8i64_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_1
; AVX2-NEXT:  # BB#2:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT:    jmp .LBB78_3
; AVX2-NEXT:  .LBB78_1:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT:  .LBB78_3:
; AVX2-NEXT:    vmovq %xmm2, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_4
; AVX2-NEXT:  # BB#5:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT:    jmp .LBB78_6
; AVX2-NEXT:  .LBB78_4:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX2-NEXT:    vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT:  .LBB78_6:
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT:    vmovq %xmm2, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_7
; AVX2-NEXT:  # BB#8:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm4
; AVX2-NEXT:    jmp .LBB78_9
; AVX2-NEXT:  .LBB78_7:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm4
; AVX2-NEXT:    vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT:  .LBB78_9:
; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_10
; AVX2-NEXT:  # BB#11:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    jmp .LBB78_12
; AVX2-NEXT:  .LBB78_10:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT:  .LBB78_12:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_13
; AVX2-NEXT:  # BB#14:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT:    jmp .LBB78_15
; AVX2-NEXT:  .LBB78_13:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm5
; AVX2-NEXT:    vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT:  .LBB78_15:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_16
; AVX2-NEXT:  # BB#17:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT:    jmp .LBB78_18
; AVX2-NEXT:  .LBB78_16:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX2-NEXT:    vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT:  .LBB78_18:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vmovq %xmm4, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_19
; AVX2-NEXT:  # BB#20:
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT:    jmp .LBB78_21
; AVX2-NEXT:  .LBB78_19:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm5
; AVX2-NEXT:  .LBB78_21:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_22
; AVX2-NEXT:  # BB#23:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    jmp .LBB78_24
; AVX2-NEXT:  .LBB78_22:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT:  .LBB78_24:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i64>, <8 x i64> *%a
  %cvt = uitofp <8 x i64> %ld to <8 x float>
  ret <8 x float> %cvt
}

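; Note: for 8 x i32 the AVX1 lowering below splits each lane differently: the
; low 16 bits are converted exactly with cvtdq2ps, the high 16 bits are
; converted and then scaled by a constant from memory (presumably 2^16 = 65536.0)
; with vmulps, and the partial results are added. AVX2 instead reuses the
; blend/magic-constant pattern from the 4 x i32 case on full 256-bit vectors.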
define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-LABEL: uitofp_load_8i32_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pand %xmm2, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT:    por %xmm4, %xmm3
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
; SSE-NEXT:    por %xmm5, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
; SSE-NEXT:    addps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm3, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm2
; SSE-NEXT:    por %xmm4, %xmm2
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    addps %xmm6, %xmm1
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_8i32_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_8i32_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i32>, <8 x i32> *%a
  %cvt = uitofp <8 x i32> %ld to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-LABEL: uitofp_load_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i16>, <8 x i16> *%a
  %cvt = uitofp <8 x i16> %ld to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-LABEL: uitofp_load_8i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_8i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_8i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i8>, <8 x i8> *%a
  %cvt = uitofp <8 x i8> %ld to <8 x float>
  ret <8 x float> %cvt
}

;
; Aggregates
;

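; Note: %Arguments is a packed struct, so its fields sit at byte offsets 0
; (<8 x i8>), 8 (<8 x i16>), and 24 (the <8 x float>* destination), which is
; why the code below loads the vector with an unaligned move from 8(%rdi) and
; the store pointer from 24(%rdi).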
%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movq 24(%rdi), %rax
; SSE-NEXT:    movdqu 8(%rdi), %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 16(%rax)
; SSE-NEXT:    movaps %xmm1, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    movq 24(%rdi), %rax
; AVX1-NEXT:    vmovdqu 8(%rdi), %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    movq 24(%rdi), %rax
; AVX2-NEXT:    vpmovsxwd 8(%rdi), %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %1 = load %Arguments, %Arguments* %a0, align 1
  %2 = extractvalue %Arguments %1, 1
  %3 = extractvalue %Arguments %1, 2
  %4 = sitofp <8 x i16> %2 to <8 x float>
  store <8 x float> %4, <8 x float>* %3, align 32
  ret void
}
