//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
             _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa    %xmm0, %xmm2
        addss     %xmm1, %xmm2
        pshufd    $1, %xmm1, %xmm1      ## xmm1 = xmm1[1,0,0,0]
        pshufd    $1, %xmm0, %xmm3      ## xmm3 = xmm0[1,0,0,0]
        addss     %xmm1, %xmm3
        movaps    %xmm2, %xmm0
        unpcklps  %xmm3, %xmm0          ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups are possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes. (A sketch of the relevant MXCSR bits appears after the
select testcase below.)

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.
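
Picking up the unsafe-math note above: a minimal sketch of what "main" could
do, assuming plain MXCSR bit twiddling via _mm_getcsr/_mm_setcsr (FTZ is MXCSR
bit 15, DAZ is bit 6 and needs SSE2-era hardware; the helper name is made up):

#include <xmmintrin.h>

/* Hypothetical helper: turn on flush-to-zero and denormals-are-zero so
   denormal results are flushed and denormal inputs are treated as zero
   in SSE arithmetic. */
static void enable_fast_sse_modes(void) {
  _mm_setcsr(_mm_getcsr() | 0x8000 /* FTZ */ | 0x0040 /* DAZ */);
}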
93 94//===---------------------------------------------------------------------===// 95 96SSE should implement 'select_cc' using 'emulated conditional moves' that use 97pcmp/pand/pandn/por to do a selection instead of a conditional branch: 98 99double %X(double %Y, double %Z, double %A, double %B) { 100 %C = setlt double %A, %B 101 %z = fadd double %Z, 0.0 ;; select operand is not a load 102 %D = select bool %C, double %Y, double %z 103 ret double %D 104} 105 106We currently emit: 107 108_X: 109 subl $12, %esp 110 xorpd %xmm0, %xmm0 111 addsd 24(%esp), %xmm0 112 movsd 32(%esp), %xmm1 113 movsd 16(%esp), %xmm2 114 ucomisd 40(%esp), %xmm1 115 jb LBB_X_2 116LBB_X_1: 117 movsd %xmm0, %xmm2 118LBB_X_2: 119 movsd %xmm2, (%esp) 120 fldl (%esp) 121 addl $12, %esp 122 ret 123 124//===---------------------------------------------------------------------===// 125 126Lower memcpy / memset to a series of SSE 128 bit move instructions when it's 127feasible. 128 129//===---------------------------------------------------------------------===// 130 131Codegen: 132 if (copysign(1.0, x) == copysign(1.0, y)) 133into: 134 if (x^y & mask) 135when using SSE. 136 137//===---------------------------------------------------------------------===// 138 139Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half 140of a v4sf value. 141 142//===---------------------------------------------------------------------===// 143 144Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. 145Perhaps use pxor / xorp* to clear a XMM register first? 146 147//===---------------------------------------------------------------------===// 148 149External test Nurbs exposed some problems. Look for 150__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc 151emits: 152 153 movaps (%edx), %xmm2 #59.21 154 movaps (%edx), %xmm5 #60.21 155 movaps (%edx), %xmm4 #61.21 156 movaps (%edx), %xmm3 #62.21 157 movl 40(%ecx), %ebp #69.49 158 shufps $0, %xmm2, %xmm5 #60.21 159 movl 100(%esp), %ebx #69.20 160 movl (%ebx), %edi #69.20 161 imull %ebp, %edi #69.49 162 addl (%eax), %edi #70.33 163 shufps $85, %xmm2, %xmm4 #61.21 164 shufps $170, %xmm2, %xmm3 #62.21 165 shufps $255, %xmm2, %xmm2 #63.21 166 lea (%ebp,%ebp,2), %ebx #69.49 167 negl %ebx #69.49 168 lea -3(%edi,%ebx), %ebx #70.33 169 shll $4, %ebx #68.37 170 addl 32(%ecx), %ebx #68.37 171 testb $15, %bl #91.13 172 jne L_B1.24 # Prob 5% #91.13 173 174This is the llvm code after instruction scheduling: 175 176cond_next140 (0xa910740, LLVM BB @0xa90beb0): 177 %reg1078 = MOV32ri -3 178 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 179 %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 180 %reg1080 = IMUL32rr %reg1079, %reg1037 181 %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 182 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 183 %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 184 %reg1082 = SHL32ri %reg1038, 4 185 %reg1039 = ADD32rr %reg1036, %reg1082 186 %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 187 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 188 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 189 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 190 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 191 %reg1040 = MOV32rr %reg1039 192 %reg1084 = AND32ri8 %reg1039, 15 193 CMP32ri8 %reg1084, 0 194 JE mbb<cond_next204,0xa914d30> 195 196Still ok. 
After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EAX = MOV32ri -3
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        %EDI = MOV32rr %EAX
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        %EBX = MOV32rr %EDI
        AND32ri8 %EBX<def&use>, 15
        CMP32ri8 %EBX, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
because the same value appears as operand two of more than one shufps, a
number of copies result. Note that icc suffers from the same problem. Either
the instruction selector should select pshufd, or the register allocator
could make the two-address to three-address transformation.

It also exposes some other problems. See the MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

        movss   4(%esp), %xmm1
        mulss   %xmm1, %xmm1
        xorps   %xmm0, %xmm0
        movss   %xmm1, %xmm0
        ret

Because movss from memory zeros the top 3 elements and mulss doesn't modify
them, the top elements of xmm1 are already zeroed. We could compile this to:

        movss   4(%esp), %xmm0
        mulss   %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps  c(%esp), %xmm1
        xorps   %xmm0, %xmm0
        movss   %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

        movaps  c(%esp), %xmm1
        movaps  %xmm1, c2(%esp)
        ...

        xorps   %xmm0, %xmm0
        movaps  c2(%esp), %xmm1
        movss   %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps  c(%esp), %xmm1
        movaps  %xmm1, c2(%esp)
        ...

        movss   c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one needed element instead of all
four elements. This can be used to simplify a variety of shuffle operations,
where some of the elements are known zeros.
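
At the source level, that "one element plus three zeros" reload is exactly
what _mm_load_ss expresses; a minimal sketch, with a hypothetical pointer to
the spill slot:

#include <xmmintrin.h>

/* Hypothetical illustration: reloading a single lane from a spilled vector.
   _mm_load_ss brings in the low element and three zeros, so no separate
   xorps + full-width reload + movss sequence is needed. */
__m128 reload_low_lane(const float *spill_slot) {
  return _mm_load_ss(spill_slot);   /* { *spill_slot, 0, 0, 0 } */
}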

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl    8(%esp), %eax
        movaps  (%eax), %xmm0
        pxor    %xmm1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $50, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl    8(%esp), %ecx
        movaps  (%ecx), %xmm0
        xor     %eax, %eax
        pinsrw  $6, %eax, %xmm0
        pinsrw  $7, %eax, %xmm0
        movaps  %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the transformation that merged four float loads into a single 128-bit
load to loads from the constant pool as well.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-math is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones
        pslld   xmm1, 31        ; xmm1 = all 100000000000... (sign bits)

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <emmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
        movzwl  4(%esp), %eax
        movd    %eax, %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret
_y:
        movd    4(%esp), %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret

"y" looks good, but "x" does a silly movzwl through a GPR. It seems like movd
would be sufficient in both cases, as the value is already zero extended in
the 32-bit stack slot IIRC. For signed short, it should also be safe, as a
genuinely negative value would be undefined for pslld.

//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has several problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1          ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                        ## <4 x float>
        .long   2147483648      ## float -0
        .long   2147483648      ## float -0
        .long   2147483648      ## float -0
        .long   2147483648      ## float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target-independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 operands simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.
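
For reference, a small sketch of the kind of combined insert-plus-zero that a
single insertps can express (SSE4.1 intrinsics; the helper name and immediate
are illustrative: bits 7:6 of the immediate pick the source lane, bits 5:4 the
destination lane, and bits 3:0 form a zero mask):

#include <smmintrin.h>

/* One insertps inserts b[0] into lane 1 of a and zeroes lanes 2 and 3 at
   the same time, producing { a[0], b[0], 0, 0 }. */
__m128 insert_and_zero(__m128 a, __m128 b) {
  return _mm_insert_ps(a, b, 0x1C);   /* src=0, dst=1, zmask=0b1100 */
}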

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be any vector spills or not.
The stack realignment logic has to be overly conservative here; otherwise we
could produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
-22725, -12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss   4(%esp), %xmm0
        pshufd  $81, %xmm0, %xmm0
        ret

in x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps   %xmm1, %xmm1
        movss   %xmm0, %xmm1
        pshufd  $81, %xmm1, %xmm0
        ret

In sse4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps        $0x10, x2(%rip), %xmm0
        insertps        $0x10, x3(%rip), %xmm1
        movaps  %xmm1, %xmm2
        movlhps %xmm0, %xmm2
        movaps  %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
  %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
  ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                        ## <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq       %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq       %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds: pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:

        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret

gcc-4.2:

        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align 4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be:

        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Likewise, insertelement of a constant into a vector of constants
should result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align  4
LCPI1_1:                                        ## float
        .long   1065353216      ## float 1
        .const

        .align  4
LCPI1_0:                                        ## <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to this IR (x86-32):

define float @foo(i8 zeroext %x) nounwind {
  %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
  ret float %tmp12
}

which compiles to:

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss        %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
        cvtsi2ss        8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this,
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

in SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd        16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in scimark where using fpstack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.

//===---------------------------------------------------------------------===//

The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
"cmpsd". For example, this code:

double d1(double x) { return x == x ? x : x + x; }

compiles into:

_d1:
        ucomisd %xmm0, %xmm0
        jnp     LBB1_2
        addsd   %xmm0, %xmm0
        ret
LBB1_2:
        ret

Also, the 'ret's should be shared. This is PR6032.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214); perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4 x float,
doing a shuffle from v[1] to v[0], and then a float store.

//===---------------------------------------------------------------------===//

On SSE4 machines, we compile this code:

define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
                          <2 x float> *%P) nounwind {
  %Z = fadd <2 x float> %Q, %R
  store <2 x float> %Z, <2 x float> *%P
  ret <2 x float> %Z
}

into:

_test2:                                 ## @test2
## BB#0:
        insertps        $0, %xmm2, %xmm2
        insertps        $16, %xmm3, %xmm2
        insertps        $0, %xmm0, %xmm3
        insertps        $16, %xmm1, %xmm3
        addps   %xmm2, %xmm3
        movq    %xmm3, (%rdi)
        movaps  %xmm3, %xmm0
        pshufd  $1, %xmm3, %xmm1
        ## kill: XMM1<def> XMM1<kill>
        ret

The insertps's of $0 are pointless complex copies.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs. This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles we could even replace the sqrtss
and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
cost of reduced accuracy.

//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
also turn into a single hsubpd.
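
For reference, the {v[0]-v[1], w[0]-w[1]} case written out as a testcase (a
sketch using the same vector-subscript style as above; the function name is
arbitrary):

#include <x86intrin.h>

/* With SSE3 enabled, this whole body should match a single
   "hsubpd %xmm1, %xmm0", i.e. { v[0]-v[1], w[0]-w[1] }. */
__m128d hsub2(__m128d v, __m128d w) {
  return (__m128d){ v[0] - v[1], w[0] - w[1] };
}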

//===---------------------------------------------------------------------===//

define <2 x i32> @foo(<2 x double> %in) {
  %x = fptosi <2 x double> %in to <2 x i32>
  ret <2 x i32> %x
}

Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.

//===---------------------------------------------------------------------===//