//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
              _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
For example, compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa   %xmm0, %xmm2
        addss    %xmm1, %xmm2
        pshufd   $1, %xmm1, %xmm1       ## xmm1 = xmm1[1,0,0,0]
        pshufd   $1, %xmm0, %xmm3       ## xmm3 = xmm0[1,0,0,0]
        addss    %xmm1, %xmm3
        movaps   %xmm2, %xmm0
        unpcklps %xmm3, %xmm0           ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups are possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if ((x ^ y) & mask)
when using SSE.

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value. Also movlps on the
lower half of a v4sf value.
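
At the source level, one way to express those updates (a sketch using the SSE1
intrinsics; the function names here are made up) is with _mm_loadh_pi /
_mm_loadl_pi, which should select to movhps / movlps:

#include <xmmintrin.h>

/* Replace the upper two floats of v with the two floats at p (movhps). */
__m128 update_high(__m128 v, const float *p) {
  return _mm_loadh_pi(v, (const __m64 *)p);
}

/* Replace the lower two floats of v with the two floats at p (movlps). */
__m128 update_low(__m128 v, const float *p) {
  return _mm_loadl_pi(v, (const __m64 *)p);
}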

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps  (%edx), %xmm2           #59.21
        movaps  (%edx), %xmm5           #60.21
        movaps  (%edx), %xmm4           #61.21
        movaps  (%edx), %xmm3           #62.21
        movl    40(%ecx), %ebp          #69.49
        shufps  $0, %xmm2, %xmm5        #60.21
        movl    100(%esp), %ebx         #69.20
        movl    (%ebx), %edi            #69.20
        imull   %ebp, %edi              #69.49
        addl    (%eax), %edi            #70.33
        shufps  $85, %xmm2, %xmm4       #61.21
        shufps  $170, %xmm2, %xmm3      #62.21
        shufps  $255, %xmm2, %xmm2      #63.21
        lea     (%ebp,%ebp,2), %ebx     #69.49
        negl    %ebx                    #69.49
        lea     -3(%edi,%ebx), %ebx     #70.33
        shll    $4, %ebx                #68.37
        addl    32(%ecx), %ebx          #68.37
        testb   $15, %bl                #91.13
        jne     L_B1.24  # Prob 5%      #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
        %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %eax = MOV32ri -3
        %edx = MOV32rm %stack.3, 1, %noreg, 0
        ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
        %edx = MOV32rm %stack.7, 1, %noreg, 0
        %edx = MOV32rm %edx, 1, %noreg, 40
        IMUL32rr %eax<def&use>, %edx
        %esi = MOV32rm %stack.5, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 0
        MOV32mr %stack.4, 1, %noreg, 0, %esi
        %eax = LEA32r %esi, 1, %eax, -3
        %esi = MOV32rm %stack.7, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 32
        %edi = MOV32rr %eax
        SHL32ri %edi<def&use>, 4
        ADD32rr %edi<def&use>, %esi
        %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
        %xmm1 = MOVAPSrr %xmm0
        SHUFPSrr %xmm1<def&use>, %xmm1, 170
        %xmm2 = MOVAPSrr %xmm0
        SHUFPSrr %xmm2<def&use>, %xmm2, 0
        %xmm3 = MOVAPSrr %xmm0
        SHUFPSrr %xmm3<def&use>, %xmm3, 255
        SHUFPSrr %xmm0<def&use>, %xmm0, 85
        %ebx = MOV32rr %edi
        AND32ri8 %ebx<def&use>, 15
        CMP32ri8 %ebx, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two of more than one shufps, a number
of copies are needed. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator should
perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.
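
Going back to the shufps problem: a hypothetical reduction of the shape
involved (not the actual Nurbs source). Each _mm_shuffle_ps below uses the same
register for both operands, so with a destructive two-address shufps the
allocator has to copy the source before every shuffle, while a pshufd-style
selection (or a two-address to three-address rewrite) would not:

#include <xmmintrin.h>

/* Splat each lane of a row into its own vector, as the icc code above does
   with four shufps of %xmm2. */
void splat_lanes(const float *row, __m128 out[4]) {
  __m128 r = _mm_load_ps(row);
  out[0] = _mm_shuffle_ps(r, r, 0x00);  /* lane 0 everywhere */
  out[1] = _mm_shuffle_ps(r, r, 0x55);  /* lane 1 everywhere */
  out[2] = _mm_shuffle_ps(r, r, 0xAA);  /* lane 2 everywhere */
  out[3] = _mm_shuffle_ps(r, r, 0xFF);  /* lane 3 everywhere */
}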

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

        movss 4(%esp), %xmm1
        mulss %xmm1, %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zeroed. We could compile this to:

        movss 4(%esp), %xmm0
        mulss %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        xorps %xmm0, %xmm0
        movaps c2(%esp), %xmm1
        movss %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movss c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        pxor %xmm1, %xmm1
        movaps %xmm0, %xmm2
        shufps $50, %xmm1, %xmm2
        shufps $132, %xmm2, %xmm0
        movaps %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
        xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-math is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.
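
For reference, a max-shaped source pattern (a sketch, not a testcase from the
tree) that should select to a single maxss. Whether the operands can then be
swapped depends on the unsafe-FP setting described above, since maxss is not
symmetric with respect to NaNs and signed zeros; commutability mainly helps
when one input comes from memory and should be folded into the instruction:

/* Should become a single maxss of a and b. */
float fmax32(float a, float b) {
  return a > b ? a : b;
}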

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones
        pslld   xmm1, 31        ; xmm1 = 0x80000000 in each lane (signbit)

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
        movzwl 4(%esp), %eax
        movd   %eax, %xmm0
        movaps _a, %xmm1
        pslld  %xmm0, %xmm1
        movaps %xmm1, _a
        ret
_y:
        movd   4(%esp), %xmm0
        movaps _a, %xmm1
        pslld  %xmm0, %xmm1
        movaps %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff through a GPR. It seems like
movd would be sufficient in both cases, as the value is already zero extended
in the 32-bit stack slot IIRC. For signed short it should also be safe, as a
genuinely negative shift count would give an undefined result for pslld anyway.

//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl $12, %esp
        movsd 16(%esp), %xmm0
        movsd %xmm0, (%esp)
        movl 4(%esp), %eax
        shrl $31, %eax
        addl $12, %esp
        ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load; a sketch of the load pattern appears after the list
below. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.
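
The sketch referred to above (hypothetical names; not vec_align.ll itself):
four scalar loads from P, P+4, P+8, P+12 that should become one aligned
128-bit load.

typedef float v4f32 __attribute__((vector_size(16)));

static float P[4] __attribute__((aligned(16)));

/* Four adjacent scalar loads that could be a single movaps from P. */
v4f32 load4(void) {
  return (v4f32){ P[0], P[1], P[2], P[3] };
}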

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1          ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                # <4 x float>
        .long 2147483648                # float -0
        .long 2147483648                # float -0
        .long 2147483648                # float -0
        .long 2147483648                # float -0
_ccosf:
        subl $12, %esp
        movss 16(%esp), %xmm0
        movss %xmm0, 4(%esp)
        movss 20(%esp), %xmm0
        xorps LCPI1_0, %xmm0
        movss %xmm0, (%esp)
        call L_ccoshf$stub
        addl $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal, it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.
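
For reference, the reg-reg flavor of those insert/extract combinations at the
source level (a sketch using GCC vector extensions, not a testcase from the
tree):

typedef double v2f64 __attribute__((vector_size(16)));

/* extractelement <2 x double>, 1 */
double extract_hi(v2f64 v) {
  return v[1];
}

/* insertelement <2 x double>, double, 1 */
v2f64 insert_hi(v2f64 v, double d) {
  v[1] = d;
  return v;
}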

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually needed.
The problem is that we need to know about stack alignment too early, before RA
runs.

At that point we don't know whether there will be a vector spill or not.
The stack realignment logic is overly conservative here, but otherwise we could
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl %ebp
        movl %esp, %ebp
        andl $-16, %esp
        movaps .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl %ebp, %esp
        popl %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss 4(%esp), %xmm0
        pshufd $81, %xmm0, %xmm0
        ret

in x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps %xmm1, %xmm1
        movss %xmm0, %xmm1
        pshufd $81, %xmm1, %xmm0
        ret

In SSE4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps $0x10, x2(%rip), %xmm0
        insertps $0x10, x3(%rip), %xmm1
        movaps   %xmm1, %xmm2
        movlhps  %xmm0, %xmm2
        movaps   %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                ## <4 x i32>
        .long 10
        .long 10
        .long 10
        .long 10
        .text
        .align 4,0x90
        .globl _f
_f:
        pshufd $3, %xmm0, %xmm1
        movd %xmm1, %eax
        imull LCPI1_0+12, %eax
        movd %eax, %xmm1
        pshufd $1, %xmm0, %xmm2
        movd %xmm2, %eax
        imull LCPI1_0+4, %eax
        movd %eax, %xmm2
        punpckldq %xmm1, %xmm2
        movd %xmm0, %eax
        imull LCPI1_0, %eax
        movd %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd %xmm0, %eax
        imull LCPI1_0+8, %eax
        movd %eax, %xmm0
        punpckldq %xmm0, %xmm1
        movaps %xmm1, %xmm0
        punpckldq %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds (pslld and paddd here). And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.
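
For instance, multiply-by-10 can be decomposed into two shifts and an add (a
sketch using GCC vector extensions; the shifts should select to pslld and the
add to paddd):

typedef int v4i32 __attribute__((vector_size(16)));

/* x * 10 == (x << 3) + (x << 1): two pslld and one paddd instead of four
   scalar imulls (pre-SSE4.1) or a pmulld. */
v4i32 mul10(v4i32 x) {
  return (x << 3) + (x << 1);
}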

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl $1, %eax
        xorps %xmm0, %xmm0
        pinsrw $2, %eax, %xmm0
        movzbl 4(%esp), %eax
        pinsrw $3, %eax, %xmm0
        movl $256, %eax
        pinsrw $7, %eax, %xmm0
        ret

gcc-4.2:
        subl $12, %esp
        movzbl 16(%esp), %eax
        movdqa LC0, %xmm0
        pinsrw $3, %eax, %xmm0
        addl $12, %esp
        ret
        .const
        .align 4
LC0:
        .word 0
        .word 0
        .word 1
        .word 0
        .word 0
        .word 0
        .word 0
        .word 256

With SSE4, it should be
        movdqa .LC0(%rip), %xmm0
        pinsrb $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Also, insertelement of a constant into a vector of constants
should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align 4
LCPI1_1:                                ## float
        .long 1065353216                ## float 1
        .const

        .align 4
LCPI1_0:                                ## <4 x float>
        .space 4
        .long 1065353216                ## float 1
        .space 4
        .long 1065353216                ## float 1
        .text
        .align 4,0x90
        .globl _t
_t:
        xorps %xmm0, %xmm0
        movhps LCPI1_0, %xmm0
        movss LCPI1_1, %xmm1
        movaps %xmm0, %xmm2
        shufps $2, %xmm1, %xmm2
        shufps $132, %xmm2, %xmm0
        movaps %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to this IR (x86-32):

define float @foo(i8 zeroext %x) nounwind {
  %tmp12 = uitofp i8 %x to float        ; <float> [#uses=1]
  ret float %tmp12
}

which compiles to:

_foo:
        subl $4, %esp
        movzbl 8(%esp), %eax
        cvtsi2ss %eax, %xmm0
        movss %xmm0, (%esp)
        flds (%esp)
        addl $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic-programming-based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long 1082130432                ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl $4, %esp
        movl 8(%esp), %eax
        movl %eax, (%esp)
        fildl (%esp)
        fmuls LCPI1_0
        addl $4, %esp
        ret

in SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl $12, %esp
        cvtsi2sd 16(%esp), %xmm0
        mulsd LCPI1_0, %xmm0
        movsd %xmm0, (%esp)
        fldl (%esp)
        addl $12, %esp
        ret

There are also other cases in scimark where using fpstack is better; it is
cheaper to do fld1 than load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.
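
For concreteness, the "load, add 1.0, store" shape mentioned above (a sketch):
on x87 the constant can be materialized with fld1 and added in place, while
the SSE version has to load 1.0 from a constant pool before the addsd.

/* On x87: fldl, fld1, faddp, fstpl, with no constant pool traffic.
   With SSE: a constant-pool load of 1.0, addsd, then movsd. */
void add_one(double *p) {
  *p += 1.0;
}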

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd %xmm0, %rax
        shrq $32, %rax
        movl %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4 x float,
doing a shuffle from v[1] to v[0], and then a float store.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs. This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles we could even replace the sqrtss
and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
cost of reduced accuracy.

//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.

//===---------------------------------------------------------------------===//
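
Continuing the note above, a source-level sketch of the {v[0]-v[1], w[0]-w[1]}
form (hypothetical function name): both lanes are low-minus-high subtractions,
which is exactly what a single hsubpd computes when SSE3 is enabled.

#include <x86intrin.h>

/* { v[0]-v[1], w[0]-w[1] }: should become one hsubpd of v and w. */
__m128d hsub2(__m128d v, __m128d w) {
  return (__m128d){ v[0] - v[1], w[0] - w[1] };
}

//===---------------------------------------------------------------------===//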