; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64

; If we are transferring XMM conversion results to MMX registers we could use the MMX equivalents
; (CVTPD2PI/CVTTPD2PI + CVTPS2PI/CVTTPS2PI) without affecting rounding/exceptions etc.

define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <2 x double> %0 to <2 x i32>
  %4 = bitcast <2 x i32> %3 to x86_mmx
  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
  %6 = bitcast x86_mmx %5 to i64
  %7 = insertelement <1 x i64> undef, i64 %6, i32 0
  store <1 x i64> %7, <1 x i64>* %1
  ret void
}

define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v4f32_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %5 = bitcast <2 x i32> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  %7 = bitcast x86_mmx %6 to i64
  %8 = insertelement <1 x i64> undef, i64 %7, i32 0
  store <1 x i64> %8, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

; FIXME: If we are transferring MMX registers to XMM for conversion we could use the MMX equivalents
; (CVTPI2PD + CVTPI2PS) without affecting rounding/exceptions etc.

define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    cvtdq2pd (%esp), %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f64:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = bitcast <2 x i64> %6 to <4 x i32>
  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %9 = sitofp <2 x i32> %8 to <2 x double>
  ret <2 x double> %9
}

define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq2dq %mm0, %xmm0
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to <2 x i32>
  %6 = shufflevector <2 x i32> %5, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %7 = sitofp <4 x i32> %6 to <4 x float>
  ret <4 x float> %7
}

define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: cvt_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = insertelement <2 x i64> %6, i64 0, i32 1
  %8 = bitcast <2 x i64> %7 to <4 x i32>
  %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
  ret <4 x float> %9
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
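
; For reference, a minimal sketch (not part of the autogenerated assertions above)
; of the direct MMX conversion intrinsics that the transforms in this file target:
; calling them explicitly lowers straight to CVTPD2PI/CVTPI2PD with no XMM round
; trip. The @direct_* function names are illustrative only; the intrinsic names and
; signatures are the llvm.x86.sse.cvtpd2pi / llvm.x86.sse.cvtpi2pd pair.

define x86_mmx @direct_cvtpd2pi(<2 x double> %v) nounwind {
  ; Converts <2 x double> to packed i32 in an MMX register (cvtpd2pi).
  %r = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %v)
  ret x86_mmx %r
}

define <2 x double> @direct_cvtpi2pd(x86_mmx %m) nounwind {
  ; Converts packed i32 in an MMX register to <2 x double> (cvtpi2pd),
  ; the direction the FIXME above would like the sitofp patterns to reach.
  %r = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %m)
  ret <2 x double> %r
}

declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)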