; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64

; If we are transferring XMM conversion results to MMX registers, we can use the MMX
; equivalents (CVTPD2PI/CVTTPD2PI + CVTPS2PI/CVTTPS2PI) without affecting rounding or
; exception behavior.
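;
; For example, @cvt_v2f64_v2i32 below converts via CVTPD2DQ in IR and then moves the
; low half into an MMX register; the X64 checks verify that the backend folds this into
; a single MMX-domain conversion rather than an XMM->GPR->MMX round-trip:
;   cvtpd2pi %xmm0, %mm0   # convert two f64s straight into an MMX register
;   paddd    %mm0, %mm0
;   movq     %mm0, (%rdi)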

define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <2 x double> %0 to <2 x i32>
  %4 = bitcast <2 x i32> %3 to x86_mmx
  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
  %6 = bitcast x86_mmx %5 to i64
  %7 = insertelement <1 x i64> undef, i64 %6, i32 0
  store <1 x i64> %7, <1 x i64>* %1
  ret void
}

define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v4f32_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %5 = bitcast <2 x i32> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  %7 = bitcast x86_mmx %6 to i64
  %8 = insertelement <1 x i64> undef, i64 %7, i32 0
  store <1 x i64> %8, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

; FIXME: If we are transferring MMX registers to XMM for conversion, we could use the
; MMX equivalents (CVTPI2PD + CVTPI2PS) without affecting rounding or exception behavior.
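;
; A sketch of the codegen the FIXME is asking for (not what llc currently emits): in
; @sitofp_v2i32_v2f64 below, the movq2dq+cvtdq2pd pair could in principle collapse to
; the single MMX-source conversion
;   cvtpi2pd %mm0, %xmm0   # convert two signed i32s in an MMX register to two f64s
; keeping the value in the MMX domain right up to the conversion.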

define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    cvtdq2pd (%esp), %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f64:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = bitcast <2 x i64> %6 to <4 x i32>
  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %9 = sitofp <2 x i32> %8 to <2 x double>
  ret <2 x double> %9
}

define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq2dq %mm0, %xmm0
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to <2 x i32>
  %6 = shufflevector <2 x i32> %5, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %7 = sitofp <4 x i32> %6 to <4 x float>
  ret <4 x float> %7
}

define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: cvt_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = insertelement <2 x i64> %6, i64 0, i32 1
  %8 = bitcast <2 x i64> %7 to <4 x i32>
  %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
  ret <4 x float> %9
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)