; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64

; If we are transferring XMM conversion results to MMX registers, we could use the MMX equivalents
; (CVTPD2PI/CVTTPD2PI and CVTPS2PI/CVTTPS2PI) without affecting rounding, exceptions, etc.
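;
; As an illustrative sketch only (not part of the autogenerated checks, and with
; register choices picked for the example), the transfer pattern this refers to is:
;   cvttpd2dq %xmm0, %xmm1   # SSE: v2f64 -> v2i32, result in XMM
;   movdq2q   %xmm1, %mm0    # move low 64 bits from XMM to MMX
; which could instead be the single MMX instruction:
;   cvttpd2pi %xmm0, %mm0    # v2f64 -> v2i32, result directly in MMX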

define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <2 x double> %0 to <2 x i32>
  %4 = bitcast <2 x i32> %3 to x86_mmx
  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
  %6 = bitcast x86_mmx %5 to i64
  %7 = insertelement <1 x i64> undef, i64 %6, i32 0
  store <1 x i64> %7, <1 x i64>* %1
  ret void
}

define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v4f32_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %5 = bitcast <2 x i32> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  %7 = bitcast x86_mmx %6 to i64
  %8 = insertelement <1 x i64> undef, i64 %7, i32 0
  store <1 x i64> %8, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

; FIXME: If we are transferring MMX registers to XMM for conversion, we could use the MMX equivalents
; (CVTPI2PD and CVTPI2PS) without affecting rounding, exceptions, etc.
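;
; As an illustrative sketch only (not part of the autogenerated checks), the current
; lowering seen in the X64 checks below:
;   movq2dq  %mm0, %xmm0     # move MMX value into low 64 bits of XMM
;   cvtdq2pd %xmm0, %xmm0    # SSE: v2i32 -> v2f64
; could instead be the single MMX instruction:
;   cvtpi2pd %mm0, %xmm0     # v2i32 (MMX) -> v2f64, converted directly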

define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    cvtdq2pd (%esp), %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f64:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = bitcast <2 x i64> %6 to <4 x i32>
  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %9 = sitofp <2 x i32> %8 to <2 x double>
  ret <2 x double> %9
}

define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to <2 x i32>
  %6 = shufflevector <2 x i32> %5, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %7 = sitofp <4 x i32> %6 to <4 x float>
  ret <4 x float> %7
}

define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: cvt_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    movq %rax, %xmm0
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = insertelement <2 x i64> %6, i64 0, i32 1
  %8 = bitcast <2 x i64> %7 to <4 x i32>
  %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
  ret <4 x float> %9
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)