1//===---------------------------------------------------------------------===// 2// Random ideas for the X86 backend. 3//===---------------------------------------------------------------------===// 4 5This should be one DIV/IDIV instruction, not a libcall: 6 7unsigned test(unsigned long long X, unsigned Y) { 8 return X/Y; 9} 10 11This can be done trivially with a custom legalizer. What about overflow 12though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224 13 14//===---------------------------------------------------------------------===// 15 16Improvements to the multiply -> shift/add algorithm: 17http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html 18 19//===---------------------------------------------------------------------===// 20 21Improve code like this (occurs fairly frequently, e.g. in LLVM): 22long long foo(int x) { return 1LL << x; } 23 24http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html 25http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html 26http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html 27 28Another useful one would be ~0ULL >> X and ~0ULL << X. 29 30One better solution for 1LL << x is: 31 xorl %eax, %eax 32 xorl %edx, %edx 33 testb $32, %cl 34 sete %al 35 setne %dl 36 sall %cl, %eax 37 sall %cl, %edx 38 39But that requires good 8-bit subreg support. 40 41Also, this might be better. It's an extra shift, but it's one instruction 42shorter, and doesn't stress 8-bit subreg support. 43(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, 44but without the unnecessary and.) 45 movl %ecx, %eax 46 shrl $5, %eax 47 movl %eax, %edx 48 xorl $1, %edx 49 sall %cl, %eax 50 sall %cl. %edx 51 5264-bit shifts (in general) expand to really bad code. Instead of using 53cmovs, we should expand to a conditional branch like GCC produces. 54 55//===---------------------------------------------------------------------===// 56 57Some isel ideas: 58 591. Dynamic programming based approach when compile time is not an 60 issue. 612. Code duplication (addressing mode) during isel. 623. Other ideas from "Register-Sensitive Selection, Duplication, and 63 Sequencing of Instructions". 644. Scheduling for reduced register pressure. E.g. "Minimum Register 65 Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 66 and other related papers. 67 http://citeseer.ist.psu.edu/govindarajan01minimum.html 68 69//===---------------------------------------------------------------------===// 70 71Should we promote i16 to i32 to avoid partial register update stalls? 72 73//===---------------------------------------------------------------------===// 74 75Leave any_extend as pseudo instruction and hint to register 76allocator. Delay codegen until post register allocation. 77Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach 78the coalescer how to deal with it though. 79 80//===---------------------------------------------------------------------===// 81 82It appears icc use push for parameter passing. Need to investigate. 83 84//===---------------------------------------------------------------------===// 85 86This: 87 88void foo(void); 89void bar(int x, int *P) { 90 x >>= 2; 91 if (x) 92 foo(); 93 *P = x; 94} 95 96compiles into: 97 98 movq %rsi, %rbx 99 movl %edi, %r14d 100 sarl $2, %r14d 101 testl %r14d, %r14d 102 je LBB0_2 103 104Instead of doing an explicit test, we can use the flags off the sar. This 105occurs in a bigger testcase like this, which is pretty common: 106 107#include <vector> 108int test1(std::vector<int> &X) { 109 int Sum = 0; 110 for (long i = 0, e = X.size(); i != e; ++i) 111 X[i] = 0; 112 return Sum; 113} 114 115//===---------------------------------------------------------------------===// 116 117Only use inc/neg/not instructions on processors where they are faster than 118add/sub/xor. They are slower on the P4 due to only updating some processor 119flags. 120 121//===---------------------------------------------------------------------===// 122 123The instruction selector sometimes misses folding a load into a compare. The 124pattern is written as (cmp reg, (load p)). Because the compare isn't 125commutative, it is not matched with the load on both sides. The dag combiner 126should be made smart enough to cannonicalize the load into the RHS of a compare 127when it can invert the result of the compare for free. 128 129//===---------------------------------------------------------------------===// 130 131In many cases, LLVM generates code like this: 132 133_test: 134 movl 8(%esp), %eax 135 cmpl %eax, 4(%esp) 136 setl %al 137 movzbl %al, %eax 138 ret 139 140on some processors (which ones?), it is more efficient to do this: 141 142_test: 143 movl 8(%esp), %ebx 144 xor %eax, %eax 145 cmpl %ebx, 4(%esp) 146 setl %al 147 ret 148 149Doing this correctly is tricky though, as the xor clobbers the flags. 150 151//===---------------------------------------------------------------------===// 152 153We should generate bts/btr/etc instructions on targets where they are cheap or 154when codesize is important. e.g., for: 155 156void setbit(int *target, int bit) { 157 *target |= (1 << bit); 158} 159void clearbit(int *target, int bit) { 160 *target &= ~(1 << bit); 161} 162 163//===---------------------------------------------------------------------===// 164 165Instead of the following for memset char*, 1, 10: 166 167 movl $16843009, 4(%edx) 168 movl $16843009, (%edx) 169 movw $257, 8(%edx) 170 171It might be better to generate 172 173 movl $16843009, %eax 174 movl %eax, 4(%edx) 175 movl %eax, (%edx) 176 movw al, 8(%edx) 177 178when we can spare a register. It reduces code size. 179 180//===---------------------------------------------------------------------===// 181 182Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently 183get this: 184 185define i32 @test1(i32 %X) { 186 %Y = sdiv i32 %X, 8 187 ret i32 %Y 188} 189 190_test1: 191 movl 4(%esp), %eax 192 movl %eax, %ecx 193 sarl $31, %ecx 194 shrl $29, %ecx 195 addl %ecx, %eax 196 sarl $3, %eax 197 ret 198 199GCC knows several different ways to codegen it, one of which is this: 200 201_test1: 202 movl 4(%esp), %eax 203 cmpl $-1, %eax 204 leal 7(%eax), %ecx 205 cmovle %ecx, %eax 206 sarl $3, %eax 207 ret 208 209which is probably slower, but it's interesting at least :) 210 211//===---------------------------------------------------------------------===// 212 213We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl 214We should leave these as libcalls for everything over a much lower threshold, 215since libc is hand tuned for medium and large mem ops (avoiding RFO for large 216stores, TLB preheating, etc) 217 218//===---------------------------------------------------------------------===// 219 220Optimize this into something reasonable: 221 x * copysign(1.0, y) * copysign(1.0, z) 222 223//===---------------------------------------------------------------------===// 224 225Optimize copysign(x, *y) to use an integer load from y. 226 227//===---------------------------------------------------------------------===// 228 229The following tests perform worse with LSR: 230 231lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. 232 233//===---------------------------------------------------------------------===// 234 235Adding to the list of cmp / test poor codegen issues: 236 237int test(__m128 *A, __m128 *B) { 238 if (_mm_comige_ss(*A, *B)) 239 return 3; 240 else 241 return 4; 242} 243 244_test: 245 movl 8(%esp), %eax 246 movaps (%eax), %xmm0 247 movl 4(%esp), %eax 248 movaps (%eax), %xmm1 249 comiss %xmm0, %xmm1 250 setae %al 251 movzbl %al, %ecx 252 movl $3, %eax 253 movl $4, %edx 254 cmpl $0, %ecx 255 cmove %edx, %eax 256 ret 257 258Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There 259are a number of issues. 1) We are introducing a setcc between the result of the 260intrisic call and select. 2) The intrinsic is expected to produce a i32 value 261so a any extend (which becomes a zero extend) is added. 262 263We probably need some kind of target DAG combine hook to fix this. 264 265//===---------------------------------------------------------------------===// 266 267We generate significantly worse code for this than GCC: 268http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 269http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 270 271There is also one case we do worse on PPC. 272 273//===---------------------------------------------------------------------===// 274 275For this: 276 277int test(int a) 278{ 279 return a * 3; 280} 281 282We currently emits 283 imull $3, 4(%esp), %eax 284 285Perhaps this is what we really should generate is? Is imull three or four 286cycles? Note: ICC generates this: 287 movl 4(%esp), %eax 288 leal (%eax,%eax,2), %eax 289 290The current instruction priority is based on pattern complexity. The former is 291more "complex" because it folds a load so the latter will not be emitted. 292 293Perhaps we should use AddedComplexity to give LEA32r a higher priority? We 294should always try to match LEA first since the LEA matching code does some 295estimate to determine whether the match is profitable. 296 297However, if we care more about code size, then imull is better. It's two bytes 298shorter than movl + leal. 299 300On a Pentium M, both variants have the same characteristics with regard 301to throughput; however, the multiplication has a latency of four cycles, as 302opposed to two cycles for the movl+lea variant. 303 304//===---------------------------------------------------------------------===// 305 306__builtin_ffs codegen is messy. 307 308int ffs_(unsigned X) { return __builtin_ffs(X); } 309 310llvm produces: 311ffs_: 312 movl 4(%esp), %ecx 313 bsfl %ecx, %eax 314 movl $32, %edx 315 cmove %edx, %eax 316 incl %eax 317 xorl %edx, %edx 318 testl %ecx, %ecx 319 cmove %edx, %eax 320 ret 321 322vs gcc: 323 324_ffs_: 325 movl $-1, %edx 326 bsfl 4(%esp), %eax 327 cmove %edx, %eax 328 addl $1, %eax 329 ret 330 331Another example of __builtin_ffs (use predsimplify to eliminate a select): 332 333int foo (unsigned long j) { 334 if (j) 335 return __builtin_ffs (j) - 1; 336 else 337 return 0; 338} 339 340//===---------------------------------------------------------------------===// 341 342It appears gcc place string data with linkonce linkage in 343.section __TEXT,__const_coal,coalesced instead of 344.section __DATA,__const_coal,coalesced. 345Take a look at darwin.h, there are other Darwin assembler directives that we 346do not make use of. 347 348//===---------------------------------------------------------------------===// 349 350define i32 @foo(i32* %a, i32 %t) { 351entry: 352 br label %cond_true 353 354cond_true: ; preds = %cond_true, %entry 355 %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3] 356 %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1] 357 %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1] 358 %tmp3 = load i32* %tmp2 ; <i32> [#uses=1] 359 %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1] 360 %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2] 361 %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2] 362 %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1] 363 br i1 %tmp, label %bb12, label %cond_true 364 365bb12: ; preds = %cond_true 366 ret i32 %tmp7 367} 368is pessimized by -loop-reduce and -indvars 369 370//===---------------------------------------------------------------------===// 371 372u32 to float conversion improvement: 373 374float uint32_2_float( unsigned u ) { 375 float fl = (int) (u & 0xffff); 376 float fh = (int) (u >> 16); 377 fh *= 0x1.0p16f; 378 return fh + fl; 379} 380 38100000000 subl $0x04,%esp 38200000003 movl 0x08(%esp,1),%eax 38300000007 movl %eax,%ecx 38400000009 shrl $0x10,%ecx 3850000000c cvtsi2ss %ecx,%xmm0 38600000010 andl $0x0000ffff,%eax 38700000015 cvtsi2ss %eax,%xmm1 38800000019 mulss 0x00000078,%xmm0 38900000021 addss %xmm1,%xmm0 39000000025 movss %xmm0,(%esp,1) 3910000002a flds (%esp,1) 3920000002d addl $0x04,%esp 39300000030 ret 394 395//===---------------------------------------------------------------------===// 396 397When using fastcc abi, align stack slot of argument of type double on 8 byte 398boundary to improve performance. 399 400//===---------------------------------------------------------------------===// 401 402GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting 403simplifications for integer "x cmp y ? a : b". 404 405//===---------------------------------------------------------------------===// 406 407Consider the expansion of: 408 409define i32 @test3(i32 %X) { 410 %tmp1 = urem i32 %X, 255 411 ret i32 %tmp1 412} 413 414Currently it compiles to: 415 416... 417 movl $2155905153, %ecx 418 movl 8(%esp), %esi 419 movl %esi, %eax 420 mull %ecx 421... 422 423This could be "reassociated" into: 424 425 movl $2155905153, %eax 426 movl 8(%esp), %ecx 427 mull %ecx 428 429to avoid the copy. In fact, the existing two-address stuff would do this 430except that mul isn't a commutative 2-addr instruction. I guess this has 431to be done at isel time based on the #uses to mul? 432 433//===---------------------------------------------------------------------===// 434 435Make sure the instruction which starts a loop does not cross a cacheline 436boundary. This requires knowning the exact length of each machine instruction. 437That is somewhat complicated, but doable. Example 256.bzip2: 438 439In the new trace, the hot loop has an instruction which crosses a cacheline 440boundary. In addition to potential cache misses, this can't help decoding as I 441imagine there has to be some kind of complicated decoder reset and realignment 442to grab the bytes from the next cacheline. 443 444532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines 445942 942 0x3d03 movl %dh, (1809(%esp, %esi) 446937 937 0x3d0a incl %esi 4473 3 0x3d0b cmpb %bl, %dl 44827 27 0x3d0d jnz 0x000062db <main+11707> 449 450//===---------------------------------------------------------------------===// 451 452In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. 453 454//===---------------------------------------------------------------------===// 455 456This could be a single 16-bit load. 457 458int f(char *p) { 459 if ((p[0] == 1) & (p[1] == 2)) return 1; 460 return 0; 461} 462 463//===---------------------------------------------------------------------===// 464 465We should inline lrintf and probably other libc functions. 466 467//===---------------------------------------------------------------------===// 468 469Use the FLAGS values from arithmetic instructions more. For example, compile: 470 471int add_zf(int *x, int y, int a, int b) { 472 if ((*x += y) == 0) 473 return a; 474 else 475 return b; 476} 477 478to: 479 addl %esi, (%rdi) 480 movl %edx, %eax 481 cmovne %ecx, %eax 482 ret 483instead of: 484 485_add_zf: 486 addl (%rdi), %esi 487 movl %esi, (%rdi) 488 testl %esi, %esi 489 cmove %edx, %ecx 490 movl %ecx, %eax 491 ret 492 493As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll 494without a test instruction. 495 496//===---------------------------------------------------------------------===// 497 498These two functions have identical effects: 499 500unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} 501unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} 502 503We currently compile them to: 504 505_f: 506 movl 4(%esp), %eax 507 movl %eax, %ecx 508 incl %ecx 509 movl 8(%esp), %edx 510 cmpl %edx, %ecx 511 jne LBB1_2 #UnifiedReturnBlock 512LBB1_1: #cond_true 513 addl $2, %eax 514 ret 515LBB1_2: #UnifiedReturnBlock 516 movl %ecx, %eax 517 ret 518_f2: 519 movl 4(%esp), %eax 520 movl %eax, %ecx 521 incl %ecx 522 cmpl 8(%esp), %ecx 523 sete %cl 524 movzbl %cl, %ecx 525 leal 1(%ecx,%eax), %eax 526 ret 527 528both of which are inferior to GCC's: 529 530_f: 531 movl 4(%esp), %edx 532 leal 1(%edx), %eax 533 addl $2, %edx 534 cmpl 8(%esp), %eax 535 cmove %edx, %eax 536 ret 537_f2: 538 movl 4(%esp), %eax 539 addl $1, %eax 540 xorl %edx, %edx 541 cmpl 8(%esp), %eax 542 sete %dl 543 addl %edx, %eax 544 ret 545 546//===---------------------------------------------------------------------===// 547 548This code: 549 550void test(int X) { 551 if (X) abort(); 552} 553 554is currently compiled to: 555 556_test: 557 subl $12, %esp 558 cmpl $0, 16(%esp) 559 jne LBB1_1 560 addl $12, %esp 561 ret 562LBB1_1: 563 call L_abort$stub 564 565It would be better to produce: 566 567_test: 568 subl $12, %esp 569 cmpl $0, 16(%esp) 570 jne L_abort$stub 571 addl $12, %esp 572 ret 573 574This can be applied to any no-return function call that takes no arguments etc. 575Alternatively, the stack save/restore logic could be shrink-wrapped, producing 576something like this: 577 578_test: 579 cmpl $0, 4(%esp) 580 jne LBB1_1 581 ret 582LBB1_1: 583 subl $12, %esp 584 call L_abort$stub 585 586Both are useful in different situations. Finally, it could be shrink-wrapped 587and tail called, like this: 588 589_test: 590 cmpl $0, 4(%esp) 591 jne LBB1_1 592 ret 593LBB1_1: 594 pop %eax # realign stack. 595 call L_abort$stub 596 597Though this probably isn't worth it. 598 599//===---------------------------------------------------------------------===// 600 601Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with 602a neg instead of a sub instruction. Consider: 603 604int test(char X) { return 7-X; } 605 606we currently produce: 607_test: 608 movl $7, %eax 609 movsbl 4(%esp), %ecx 610 subl %ecx, %eax 611 ret 612 613We would use one fewer register if codegen'd as: 614 615 movsbl 4(%esp), %eax 616 neg %eax 617 add $7, %eax 618 ret 619 620Note that this isn't beneficial if the load can be folded into the sub. In 621this case, we want a sub: 622 623int test(int X) { return 7-X; } 624_test: 625 movl $7, %eax 626 subl 4(%esp), %eax 627 ret 628 629//===---------------------------------------------------------------------===// 630 631Leaf functions that require one 4-byte spill slot have a prolog like this: 632 633_foo: 634 pushl %esi 635 subl $4, %esp 636... 637and an epilog like this: 638 addl $4, %esp 639 popl %esi 640 ret 641 642It would be smaller, and potentially faster, to push eax on entry and to 643pop into a dummy register instead of using addl/subl of esp. Just don't pop 644into any return registers :) 645 646//===---------------------------------------------------------------------===// 647 648The X86 backend should fold (branch (or (setcc, setcc))) into multiple 649branches. We generate really poor code for: 650 651double testf(double a) { 652 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0); 653} 654 655For example, the entry BB is: 656 657_testf: 658 subl $20, %esp 659 pxor %xmm0, %xmm0 660 movsd 24(%esp), %xmm1 661 ucomisd %xmm0, %xmm1 662 setnp %al 663 sete %cl 664 testb %cl, %al 665 jne LBB1_5 # UnifiedReturnBlock 666LBB1_1: # cond_true 667 668 669it would be better to replace the last four instructions with: 670 671 jp LBB1_1 672 je LBB1_5 673LBB1_1: 674 675We also codegen the inner ?: into a diamond: 676 677 cvtss2sd LCPI1_0(%rip), %xmm2 678 cvtss2sd LCPI1_1(%rip), %xmm3 679 ucomisd %xmm1, %xmm0 680 ja LBB1_3 # cond_true 681LBB1_2: # cond_true 682 movapd %xmm3, %xmm2 683LBB1_3: # cond_true 684 movapd %xmm2, %xmm0 685 ret 686 687We should sink the load into xmm3 into the LBB1_2 block. This should 688be pretty easy, and will nuke all the copies. 689 690//===---------------------------------------------------------------------===// 691 692This: 693 #include <algorithm> 694 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b) 695 { return std::make_pair(a + b, a + b < a); } 696 bool no_overflow(unsigned a, unsigned b) 697 { return !full_add(a, b).second; } 698 699Should compile to: 700 addl %esi, %edi 701 setae %al 702 movzbl %al, %eax 703 ret 704 705on x86-64, instead of the rather stupid-looking: 706 addl %esi, %edi 707 setb %al 708 xorb $1, %al 709 movzbl %al, %eax 710 ret 711 712 713//===---------------------------------------------------------------------===// 714 715The following code: 716 717bb114.preheader: ; preds = %cond_next94 718 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1] 719 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1] 720 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1] 721 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1] 722 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1] 723 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2] 724 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1] 725 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1] 726 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1] 727 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1] 728 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1] 729 br label %bb114 730 731produces: 732 733LBB3_5: # bb114.preheader 734 movswl -68(%ebp), %eax 735 movl $32, %ecx 736 movl %ecx, -80(%ebp) 737 subl %eax, -80(%ebp) 738 movswl -52(%ebp), %eax 739 movl %ecx, -84(%ebp) 740 subl %eax, -84(%ebp) 741 movswl -70(%ebp), %eax 742 movl %ecx, -88(%ebp) 743 subl %eax, -88(%ebp) 744 movswl -50(%ebp), %eax 745 subl %eax, %ecx 746 movl %ecx, -76(%ebp) 747 movswl -42(%ebp), %eax 748 movl %eax, -92(%ebp) 749 movswl -66(%ebp), %eax 750 movl %eax, -96(%ebp) 751 movw $0, -98(%ebp) 752 753This appears to be bad because the RA is not folding the store to the stack 754slot into the movl. The above instructions could be: 755 movl $32, -80(%ebp) 756... 757 movl $32, -84(%ebp) 758... 759This seems like a cross between remat and spill folding. 760 761This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't 762change, so we could simply subtract %eax from %ecx first and then use %ecx (or 763vice-versa). 764 765//===---------------------------------------------------------------------===// 766 767This code: 768 769 %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1] 770 br i1 %tmp659, label %cond_true662, label %cond_next715 771 772produces this: 773 774 testw %cx, %cx 775 movswl %cx, %esi 776 jns LBB4_109 # cond_next715 777 778Shark tells us that using %cx in the testw instruction is sub-optimal. It 779suggests using the 32-bit register (which is what ICC uses). 780 781//===---------------------------------------------------------------------===// 782 783We compile this: 784 785void compare (long long foo) { 786 if (foo < 4294967297LL) 787 abort(); 788} 789 790to: 791 792compare: 793 subl $4, %esp 794 cmpl $0, 8(%esp) 795 setne %al 796 movzbw %al, %ax 797 cmpl $1, 12(%esp) 798 setg %cl 799 movzbw %cl, %cx 800 cmove %ax, %cx 801 testb $1, %cl 802 jne .LBB1_2 # UnifiedReturnBlock 803.LBB1_1: # ifthen 804 call abort 805.LBB1_2: # UnifiedReturnBlock 806 addl $4, %esp 807 ret 808 809(also really horrible code on ppc). This is due to the expand code for 64-bit 810compares. GCC produces multiple branches, which is much nicer: 811 812compare: 813 subl $12, %esp 814 movl 20(%esp), %edx 815 movl 16(%esp), %eax 816 decl %edx 817 jle .L7 818.L5: 819 addl $12, %esp 820 ret 821 .p2align 4,,7 822.L7: 823 jl .L4 824 cmpl $0, %eax 825 .p2align 4,,8 826 ja .L5 827.L4: 828 .p2align 4,,9 829 call abort 830 831//===---------------------------------------------------------------------===// 832 833Tail call optimization improvements: Tail call optimization currently 834pushes all arguments on the top of the stack (their normal place for 835non-tail call optimized calls) that source from the callers arguments 836or that source from a virtual register (also possibly sourcing from 837callers arguments). 838This is done to prevent overwriting of parameters (see example 839below) that might be used later. 840 841example: 842 843int callee(int32, int64); 844int caller(int32 arg1, int32 arg2) { 845 int64 local = arg2 * 2; 846 return callee(arg2, (int64)local); 847} 848 849[arg1] [!arg2 no longer valid since we moved local onto it] 850[arg2] -> [(int64) 851[RETADDR] local ] 852 853Moving arg1 onto the stack slot of callee function would overwrite 854arg2 of the caller. 855 856Possible optimizations: 857 858 859 - Analyse the actual parameters of the callee to see which would 860 overwrite a caller parameter which is used by the callee and only 861 push them onto the top of the stack. 862 863 int callee (int32 arg1, int32 arg2); 864 int caller (int32 arg1, int32 arg2) { 865 return callee(arg1,arg2); 866 } 867 868 Here we don't need to write any variables to the top of the stack 869 since they don't overwrite each other. 870 871 int callee (int32 arg1, int32 arg2); 872 int caller (int32 arg1, int32 arg2) { 873 return callee(arg2,arg1); 874 } 875 876 Here we need to push the arguments because they overwrite each 877 other. 878 879//===---------------------------------------------------------------------===// 880 881main () 882{ 883 int i = 0; 884 unsigned long int z = 0; 885 886 do { 887 z -= 0x00004000; 888 i++; 889 if (i > 0x00040000) 890 abort (); 891 } while (z > 0); 892 exit (0); 893} 894 895gcc compiles this to: 896 897_main: 898 subl $28, %esp 899 xorl %eax, %eax 900 jmp L2 901L3: 902 cmpl $262144, %eax 903 je L10 904L2: 905 addl $1, %eax 906 cmpl $262145, %eax 907 jne L3 908 call L_abort$stub 909L10: 910 movl $0, (%esp) 911 call L_exit$stub 912 913llvm: 914 915_main: 916 subl $12, %esp 917 movl $1, %eax 918 movl $16384, %ecx 919LBB1_1: # bb 920 cmpl $262145, %eax 921 jge LBB1_4 # cond_true 922LBB1_2: # cond_next 923 incl %eax 924 addl $4294950912, %ecx 925 cmpl $16384, %ecx 926 jne LBB1_1 # bb 927LBB1_3: # bb11 928 xorl %eax, %eax 929 addl $12, %esp 930 ret 931LBB1_4: # cond_true 932 call L_abort$stub 933 9341. LSR should rewrite the first cmp with induction variable %ecx. 9352. DAG combiner should fold 936 leal 1(%eax), %edx 937 cmpl $262145, %edx 938 => 939 cmpl $262144, %eax 940 941//===---------------------------------------------------------------------===// 942 943define i64 @test(double %X) { 944 %Y = fptosi double %X to i64 945 ret i64 %Y 946} 947 948compiles to: 949 950_test: 951 subl $20, %esp 952 movsd 24(%esp), %xmm0 953 movsd %xmm0, 8(%esp) 954 fldl 8(%esp) 955 fisttpll (%esp) 956 movl 4(%esp), %edx 957 movl (%esp), %eax 958 addl $20, %esp 959 #FP_REG_KILL 960 ret 961 962This should just fldl directly from the input stack slot. 963 964//===---------------------------------------------------------------------===// 965 966This code: 967int foo (int x) { return (x & 65535) | 255; } 968 969Should compile into: 970 971_foo: 972 movzwl 4(%esp), %eax 973 orl $255, %eax 974 ret 975 976instead of: 977_foo: 978 movl $65280, %eax 979 andl 4(%esp), %eax 980 orl $255, %eax 981 ret 982 983//===---------------------------------------------------------------------===// 984 985We're codegen'ing multiply of long longs inefficiently: 986 987unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { 988 return arg1 * arg2; 989} 990 991We compile to (fomit-frame-pointer): 992 993_LLM: 994 pushl %esi 995 movl 8(%esp), %ecx 996 movl 16(%esp), %esi 997 movl %esi, %eax 998 mull %ecx 999 imull 12(%esp), %esi 1000 addl %edx, %esi 1001 imull 20(%esp), %ecx 1002 movl %esi, %edx 1003 addl %ecx, %edx 1004 popl %esi 1005 ret 1006 1007This looks like a scheduling deficiency and lack of remat of the load from 1008the argument area. ICC apparently produces: 1009 1010 movl 8(%esp), %ecx 1011 imull 12(%esp), %ecx 1012 movl 16(%esp), %eax 1013 imull 4(%esp), %eax 1014 addl %eax, %ecx 1015 movl 4(%esp), %eax 1016 mull 12(%esp) 1017 addl %ecx, %edx 1018 ret 1019 1020Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: 1021http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 1022 1023//===---------------------------------------------------------------------===// 1024 1025We can fold a store into "zeroing a reg". Instead of: 1026 1027xorl %eax, %eax 1028movl %eax, 124(%esp) 1029 1030we should get: 1031 1032movl $0, 124(%esp) 1033 1034if the flags of the xor are dead. 1035 1036Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should 1037be folded into: shl [mem], 1 1038 1039//===---------------------------------------------------------------------===// 1040 1041In SSE mode, we turn abs and neg into a load from the constant pool plus a xor 1042or and instruction, for example: 1043 1044 xorpd LCPI1_0, %xmm2 1045 1046However, if xmm2 gets spilled, we end up with really ugly code like this: 1047 1048 movsd (%esp), %xmm0 1049 xorpd LCPI1_0, %xmm0 1050 movsd %xmm0, (%esp) 1051 1052Since we 'know' that this is a 'neg', we can actually "fold" the spill into 1053the neg/abs instruction, turning it into an *integer* operation, like this: 1054 1055 xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) 1056 1057you could also use xorb, but xorl is less likely to lead to a partial register 1058stall. Here is a contrived testcase: 1059 1060double a, b, c; 1061void test(double *P) { 1062 double X = *P; 1063 a = X; 1064 bar(); 1065 X = -X; 1066 b = X; 1067 bar(); 1068 c = X; 1069} 1070 1071//===---------------------------------------------------------------------===// 1072 1073The generated code on x86 for checking for signed overflow on a multiply the 1074obvious way is much longer than it needs to be. 1075 1076int x(int a, int b) { 1077 long long prod = (long long)a*b; 1078 return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); 1079} 1080 1081See PR2053 for more details. 1082 1083//===---------------------------------------------------------------------===// 1084 1085We should investigate using cdq/ctld (effect: edx = sar eax, 31) 1086more aggressively; it should cost the same as a move+shift on any modern 1087processor, but it's a lot shorter. Downside is that it puts more 1088pressure on register allocation because it has fixed operands. 1089 1090Example: 1091int abs(int x) {return x < 0 ? -x : x;} 1092 1093gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: 1094abs: 1095 movl 4(%esp), %eax 1096 cltd 1097 xorl %edx, %eax 1098 subl %edx, %eax 1099 ret 1100 1101//===---------------------------------------------------------------------===// 1102 1103Take the following code (from 1104http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): 1105 1106extern unsigned char first_one[65536]; 1107int FirstOnet(unsigned long long arg1) 1108{ 1109 if (arg1 >> 48) 1110 return (first_one[arg1 >> 48]); 1111 return 0; 1112} 1113 1114 1115The following code is currently generated: 1116FirstOnet: 1117 movl 8(%esp), %eax 1118 cmpl $65536, %eax 1119 movl 4(%esp), %ecx 1120 jb .LBB1_2 # UnifiedReturnBlock 1121.LBB1_1: # ifthen 1122 shrl $16, %eax 1123 movzbl first_one(%eax), %eax 1124 ret 1125.LBB1_2: # UnifiedReturnBlock 1126 xorl %eax, %eax 1127 ret 1128 1129We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this 1130lets us change the cmpl into a testl, which is shorter, and eliminate the shift. 1131 1132//===---------------------------------------------------------------------===// 1133 1134We compile this function: 1135 1136define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { 1137entry: 1138 %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1] 1139 br i1 %tmp2, label %bb7, label %bb 1140 1141bb: ; preds = %entry 1142 %tmp6 = add i32 %b, %a ; <i32> [#uses=1] 1143 ret i32 %tmp6 1144 1145bb7: ; preds = %entry 1146 %tmp10 = sub i32 %a, %c ; <i32> [#uses=1] 1147 ret i32 %tmp10 1148} 1149 1150to: 1151 1152foo: # @foo 1153# BB#0: # %entry 1154 movl 4(%esp), %ecx 1155 cmpb $0, 16(%esp) 1156 je .LBB0_2 1157# BB#1: # %bb 1158 movl 8(%esp), %eax 1159 addl %ecx, %eax 1160 ret 1161.LBB0_2: # %bb7 1162 movl 12(%esp), %edx 1163 movl %ecx, %eax 1164 subl %edx, %eax 1165 ret 1166 1167There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a 1168couple more movls by putting 4(%esp) into %eax instead of %ecx. 1169 1170//===---------------------------------------------------------------------===// 1171 1172See rdar://4653682. 1173 1174From flops: 1175 1176LBB1_15: # bb310 1177 cvtss2sd LCPI1_0, %xmm1 1178 addsd %xmm1, %xmm0 1179 movsd 176(%esp), %xmm2 1180 mulsd %xmm0, %xmm2 1181 movapd %xmm2, %xmm3 1182 mulsd %xmm3, %xmm3 1183 movapd %xmm3, %xmm4 1184 mulsd LCPI1_23, %xmm4 1185 addsd LCPI1_24, %xmm4 1186 mulsd %xmm3, %xmm4 1187 addsd LCPI1_25, %xmm4 1188 mulsd %xmm3, %xmm4 1189 addsd LCPI1_26, %xmm4 1190 mulsd %xmm3, %xmm4 1191 addsd LCPI1_27, %xmm4 1192 mulsd %xmm3, %xmm4 1193 addsd LCPI1_28, %xmm4 1194 mulsd %xmm3, %xmm4 1195 addsd %xmm1, %xmm4 1196 mulsd %xmm2, %xmm4 1197 movsd 152(%esp), %xmm1 1198 addsd %xmm4, %xmm1 1199 movsd %xmm1, 152(%esp) 1200 incl %eax 1201 cmpl %eax, %esi 1202 jge LBB1_15 # bb310 1203LBB1_16: # bb358.loopexit 1204 movsd 152(%esp), %xmm0 1205 addsd %xmm0, %xmm0 1206 addsd LCPI1_22, %xmm0 1207 movsd %xmm0, 152(%esp) 1208 1209Rather than spilling the result of the last addsd in the loop, we should have 1210insert a copy to split the interval (one for the duration of the loop, one 1211extending to the fall through). The register pressure in the loop isn't high 1212enough to warrant the spill. 1213 1214Also check why xmm7 is not used at all in the function. 1215 1216//===---------------------------------------------------------------------===// 1217 1218Take the following: 1219 1220target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128" 1221target triple = "i386-apple-darwin8" 1222@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2] 1223define fastcc void @abort_gzip() noreturn nounwind { 1224entry: 1225 %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1] 1226 br i1 %tmp.b.i, label %bb.i, label %bb4.i 1227bb.i: ; preds = %entry 1228 tail call void @exit( i32 1 ) noreturn nounwind 1229 unreachable 1230bb4.i: ; preds = %entry 1231 store i1 true, i1* @in_exit.4870.b 1232 tail call void @exit( i32 1 ) noreturn nounwind 1233 unreachable 1234} 1235declare void @exit(i32) noreturn nounwind 1236 1237This compiles into: 1238_abort_gzip: ## @abort_gzip 1239## BB#0: ## %entry 1240 subl $12, %esp 1241 movb _in_exit.4870.b, %al 1242 cmpb $1, %al 1243 jne LBB0_2 1244 1245We somehow miss folding the movb into the cmpb. 1246 1247//===---------------------------------------------------------------------===// 1248 1249We compile: 1250 1251int test(int x, int y) { 1252 return x-y-1; 1253} 1254 1255into (-m64): 1256 1257_test: 1258 decl %edi 1259 movl %edi, %eax 1260 subl %esi, %eax 1261 ret 1262 1263it would be better to codegen as: x+~y (notl+addl) 1264 1265//===---------------------------------------------------------------------===// 1266 1267This code: 1268 1269int foo(const char *str,...) 1270{ 1271 __builtin_va_list a; int x; 1272 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); 1273 return x; 1274} 1275 1276gets compiled into this on x86-64: 1277 subq $200, %rsp 1278 movaps %xmm7, 160(%rsp) 1279 movaps %xmm6, 144(%rsp) 1280 movaps %xmm5, 128(%rsp) 1281 movaps %xmm4, 112(%rsp) 1282 movaps %xmm3, 96(%rsp) 1283 movaps %xmm2, 80(%rsp) 1284 movaps %xmm1, 64(%rsp) 1285 movaps %xmm0, 48(%rsp) 1286 movq %r9, 40(%rsp) 1287 movq %r8, 32(%rsp) 1288 movq %rcx, 24(%rsp) 1289 movq %rdx, 16(%rsp) 1290 movq %rsi, 8(%rsp) 1291 leaq (%rsp), %rax 1292 movq %rax, 192(%rsp) 1293 leaq 208(%rsp), %rax 1294 movq %rax, 184(%rsp) 1295 movl $48, 180(%rsp) 1296 movl $8, 176(%rsp) 1297 movl 176(%rsp), %eax 1298 cmpl $47, %eax 1299 jbe .LBB1_3 # bb 1300.LBB1_1: # bb3 1301 movq 184(%rsp), %rcx 1302 leaq 8(%rcx), %rax 1303 movq %rax, 184(%rsp) 1304.LBB1_2: # bb4 1305 movl (%rcx), %eax 1306 addq $200, %rsp 1307 ret 1308.LBB1_3: # bb 1309 movl %eax, %ecx 1310 addl $8, %eax 1311 addq 192(%rsp), %rcx 1312 movl %eax, 176(%rsp) 1313 jmp .LBB1_2 # bb4 1314 1315gcc 4.3 generates: 1316 subq $96, %rsp 1317.LCFI0: 1318 leaq 104(%rsp), %rax 1319 movq %rsi, -80(%rsp) 1320 movl $8, -120(%rsp) 1321 movq %rax, -112(%rsp) 1322 leaq -88(%rsp), %rax 1323 movq %rax, -104(%rsp) 1324 movl $8, %eax 1325 cmpl $48, %eax 1326 jb .L6 1327 movq -112(%rsp), %rdx 1328 movl (%rdx), %eax 1329 addq $96, %rsp 1330 ret 1331 .p2align 4,,10 1332 .p2align 3 1333.L6: 1334 mov %eax, %edx 1335 addq -104(%rsp), %rdx 1336 addl $8, %eax 1337 movl %eax, -120(%rsp) 1338 movl (%rdx), %eax 1339 addq $96, %rsp 1340 ret 1341 1342and it gets compiled into this on x86: 1343 pushl %ebp 1344 movl %esp, %ebp 1345 subl $4, %esp 1346 leal 12(%ebp), %eax 1347 movl %eax, -4(%ebp) 1348 leal 16(%ebp), %eax 1349 movl %eax, -4(%ebp) 1350 movl 12(%ebp), %eax 1351 addl $4, %esp 1352 popl %ebp 1353 ret 1354 1355gcc 4.3 generates: 1356 pushl %ebp 1357 movl %esp, %ebp 1358 movl 12(%ebp), %eax 1359 popl %ebp 1360 ret 1361 1362//===---------------------------------------------------------------------===// 1363 1364Teach tblgen not to check bitconvert source type in some cases. This allows us 1365to consolidate the following patterns in X86InstrMMX.td: 1366 1367def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), 1368 (iPTR 0))))), 1369 (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>; 1370def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), 1371 (iPTR 0))))), 1372 (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>; 1373def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), 1374 (iPTR 0))))), 1375 (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>; 1376 1377There are other cases in various td files. 1378 1379//===---------------------------------------------------------------------===// 1380 1381Take something like the following on x86-32: 1382unsigned a(unsigned long long x, unsigned y) {return x % y;} 1383 1384We currently generate a libcall, but we really shouldn't: the expansion is 1385shorter and likely faster than the libcall. The expected code is something 1386like the following: 1387 1388 movl 12(%ebp), %eax 1389 movl 16(%ebp), %ecx 1390 xorl %edx, %edx 1391 divl %ecx 1392 movl 8(%ebp), %eax 1393 divl %ecx 1394 movl %edx, %eax 1395 ret 1396 1397A similar code sequence works for division. 1398 1399//===---------------------------------------------------------------------===// 1400 1401These should compile to the same code, but the later codegen's to useless 1402instructions on X86. This may be a trivial dag combine (GCC PR7061): 1403 1404struct s1 { unsigned char a, b; }; 1405unsigned long f1(struct s1 x) { 1406 return x.a + x.b; 1407} 1408struct s2 { unsigned a: 8, b: 8; }; 1409unsigned long f2(struct s2 x) { 1410 return x.a + x.b; 1411} 1412 1413//===---------------------------------------------------------------------===// 1414 1415We currently compile this: 1416 1417define i32 @func1(i32 %v1, i32 %v2) nounwind { 1418entry: 1419 %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) 1420 %sum = extractvalue {i32, i1} %t, 0 1421 %obit = extractvalue {i32, i1} %t, 1 1422 br i1 %obit, label %overflow, label %normal 1423normal: 1424 ret i32 %sum 1425overflow: 1426 call void @llvm.trap() 1427 unreachable 1428} 1429declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) 1430declare void @llvm.trap() 1431 1432to: 1433 1434_func1: 1435 movl 4(%esp), %eax 1436 addl 8(%esp), %eax 1437 jo LBB1_2 ## overflow 1438LBB1_1: ## normal 1439 ret 1440LBB1_2: ## overflow 1441 ud2 1442 1443it would be nice to produce "into" someday. 1444 1445//===---------------------------------------------------------------------===// 1446 1447This code: 1448 1449void vec_mpys1(int y[], const int x[], int scaler) { 1450int i; 1451for (i = 0; i < 150; i++) 1452 y[i] += (((long long)scaler * (long long)x[i]) >> 31); 1453} 1454 1455Compiles to this loop with GCC 3.x: 1456 1457.L5: 1458 movl %ebx, %eax 1459 imull (%edi,%ecx,4) 1460 shrdl $31, %edx, %eax 1461 addl %eax, (%esi,%ecx,4) 1462 incl %ecx 1463 cmpl $149, %ecx 1464 jle .L5 1465 1466llvm-gcc compiles it to the much uglier: 1467 1468LBB1_1: ## bb1 1469 movl 24(%esp), %eax 1470 movl (%eax,%edi,4), %ebx 1471 movl %ebx, %ebp 1472 imull %esi, %ebp 1473 movl %ebx, %eax 1474 mull %ecx 1475 addl %ebp, %edx 1476 sarl $31, %ebx 1477 imull %ecx, %ebx 1478 addl %edx, %ebx 1479 shldl $1, %eax, %ebx 1480 movl 20(%esp), %eax 1481 addl %ebx, (%eax,%edi,4) 1482 incl %edi 1483 cmpl $150, %edi 1484 jne LBB1_1 ## bb1 1485 1486The issue is that we hoist the cast of "scaler" to long long outside of the 1487loop, the value comes into the loop as two values, and 1488RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the 1489constructed BUILD_PAIR which represents the cast value. 1490 1491This can be handled by making CodeGenPrepare sink the cast. 1492 1493//===---------------------------------------------------------------------===// 1494 1495Test instructions can be eliminated by using EFLAGS values from arithmetic 1496instructions. This is currently not done for mul, and, or, xor, neg, shl, 1497sra, srl, shld, shrd, atomic ops, and others. It is also currently not done 1498for read-modify-write instructions. It is also current not done if the 1499OF or CF flags are needed. 1500 1501The shift operators have the complication that when the shift count is 1502zero, EFLAGS is not set, so they can only subsume a test instruction if 1503the shift count is known to be non-zero. Also, using the EFLAGS value 1504from a shift is apparently very slow on some x86 implementations. 1505 1506In read-modify-write instructions, the root node in the isel match is 1507the store, and isel has no way for the use of the EFLAGS result of the 1508arithmetic to be remapped to the new node. 1509 1510Add and subtract instructions set OF on signed overflow and CF on unsiged 1511overflow, while test instructions always clear OF and CF. In order to 1512replace a test with an add or subtract in a situation where OF or CF is 1513needed, codegen must be able to prove that the operation cannot see 1514signed or unsigned overflow, respectively. 1515 1516//===---------------------------------------------------------------------===// 1517 1518memcpy/memmove do not lower to SSE copies when possible. A silly example is: 1519define <16 x float> @foo(<16 x float> %A) nounwind { 1520 %tmp = alloca <16 x float>, align 16 1521 %tmp2 = alloca <16 x float>, align 16 1522 store <16 x float> %A, <16 x float>* %tmp 1523 %s = bitcast <16 x float>* %tmp to i8* 1524 %s2 = bitcast <16 x float>* %tmp2 to i8* 1525 call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16) 1526 %R = load <16 x float>* %tmp2 1527 ret <16 x float> %R 1528} 1529 1530declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind 1531 1532which compiles to: 1533 1534_foo: 1535 subl $140, %esp 1536 movaps %xmm3, 112(%esp) 1537 movaps %xmm2, 96(%esp) 1538 movaps %xmm1, 80(%esp) 1539 movaps %xmm0, 64(%esp) 1540 movl 60(%esp), %eax 1541 movl %eax, 124(%esp) 1542 movl 56(%esp), %eax 1543 movl %eax, 120(%esp) 1544 movl 52(%esp), %eax 1545 <many many more 32-bit copies> 1546 movaps (%esp), %xmm0 1547 movaps 16(%esp), %xmm1 1548 movaps 32(%esp), %xmm2 1549 movaps 48(%esp), %xmm3 1550 addl $140, %esp 1551 ret 1552 1553On Nehalem, it may even be cheaper to just use movups when unaligned than to 1554fall back to lower-granularity chunks. 1555 1556//===---------------------------------------------------------------------===// 1557 1558Implement processor-specific optimizations for parity with GCC on these 1559processors. GCC does two optimizations: 1560 15611. ix86_pad_returns inserts a noop before ret instructions if immediately 1562 preceded by a conditional branch or is the target of a jump. 15632. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of 1564 code contains more than 3 branches. 1565 1566The first one is done for all AMDs, Core2, and "Generic" 1567The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, 1568 Core 2, and "Generic" 1569 1570//===---------------------------------------------------------------------===// 1571Testcase: 1572int x(int a) { return (a&0xf0)>>4; } 1573 1574Current output: 1575 movl 4(%esp), %eax 1576 shrl $4, %eax 1577 andl $15, %eax 1578 ret 1579 1580Ideal output: 1581 movzbl 4(%esp), %eax 1582 shrl $4, %eax 1583 ret 1584 1585//===---------------------------------------------------------------------===// 1586 1587Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch 1588properly. 1589 1590When the return value is not used (i.e. only care about the value in the 1591memory), x86 does not have to use add to implement these. Instead, it can use 1592add, sub, inc, dec instructions with the "lock" prefix. 1593 1594This is currently implemented using a bit of instruction selection trick. The 1595issue is the target independent pattern produces one output and a chain and we 1596want to map it into one that just output a chain. The current trick is to select 1597it into a MERGE_VALUES with the first definition being an implicit_def. The 1598proper solution is to add new ISD opcodes for the no-output variant. DAG 1599combiner can then transform the node before it gets to target node selection. 1600 1601Problem #2 is we are adding a whole bunch of x86 atomic instructions when in 1602fact these instructions are identical to the non-lock versions. We need a way to 1603add target specific information to target nodes and have this information 1604carried over to machine instructions. Asm printer (or JIT) can use this 1605information to add the "lock" prefix. 1606 1607//===---------------------------------------------------------------------===// 1608 1609struct B { 1610 unsigned char y0 : 1; 1611}; 1612 1613int bar(struct B* a) { return a->y0; } 1614 1615define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { 1616 %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0 1617 %2 = load i8* %1, align 1 1618 %3 = and i8 %2, 1 1619 %4 = zext i8 %3 to i32 1620 ret i32 %4 1621} 1622 1623bar: # @bar 1624# BB#0: 1625 movb (%rdi), %al 1626 andb $1, %al 1627 movzbl %al, %eax 1628 ret 1629 1630Missed optimization: should be movl+andl. 1631 1632//===---------------------------------------------------------------------===// 1633 1634The x86_64 abi says: 1635 1636Booleans, when stored in a memory object, are stored as single byte objects the 1637value of which is always 0 (false) or 1 (true). 1638 1639We are not using this fact: 1640 1641int bar(_Bool *a) { return *a; } 1642 1643define i32 @bar(i8* nocapture %a) nounwind readonly optsize { 1644 %1 = load i8* %a, align 1, !tbaa !0 1645 %tmp = and i8 %1, 1 1646 %2 = zext i8 %tmp to i32 1647 ret i32 %2 1648} 1649 1650bar: 1651 movb (%rdi), %al 1652 andb $1, %al 1653 movzbl %al, %eax 1654 ret 1655 1656GCC produces 1657 1658bar: 1659 movzbl (%rdi), %eax 1660 ret 1661 1662//===---------------------------------------------------------------------===// 1663 1664Consider the following two functions compiled with clang: 1665_Bool foo(int *x) { return !(*x & 4); } 1666unsigned bar(int *x) { return !(*x & 4); } 1667 1668foo: 1669 movl 4(%esp), %eax 1670 testb $4, (%eax) 1671 sete %al 1672 movzbl %al, %eax 1673 ret 1674 1675bar: 1676 movl 4(%esp), %eax 1677 movl (%eax), %eax 1678 shrl $2, %eax 1679 andl $1, %eax 1680 xorl $1, %eax 1681 ret 1682 1683The second function generates more code even though the two functions are 1684are functionally identical. 1685 1686//===---------------------------------------------------------------------===// 1687 1688Take the following C code: 1689int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } 1690 1691We generate the following IR with clang: 1692define i32 @f(i32 %a, i32 %b) nounwind readnone { 1693entry: 1694 %tmp = xor i32 %b, %a ; <i32> [#uses=1] 1695 %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] 1696 %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] 1697 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] 1698 ret i32 %conv5 1699} 1700 1701And the following x86 code: 1702 xorl %esi, %edi 1703 testb $-1, %dil 1704 sete %al 1705 movzbl %al, %eax 1706 ret 1707 1708A cmpb instead of the xorl+testb would be one instruction shorter. 1709 1710//===---------------------------------------------------------------------===// 1711 1712Given the following C code: 1713int f(int a, int b) { return (signed char)a == (signed char)b; } 1714 1715We generate the following IR with clang: 1716define i32 @f(i32 %a, i32 %b) nounwind readnone { 1717entry: 1718 %sext = shl i32 %a, 24 ; <i32> [#uses=1] 1719 %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] 1720 %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] 1721 %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] 1722 %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] 1723 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] 1724 ret i32 %conv5 1725} 1726 1727And the following x86 code: 1728 movsbl %sil, %eax 1729 movsbl %dil, %ecx 1730 cmpl %eax, %ecx 1731 sete %al 1732 movzbl %al, %eax 1733 ret 1734 1735 1736It should be possible to eliminate the sign extensions. 1737 1738//===---------------------------------------------------------------------===// 1739 1740LLVM misses a load+store narrowing opportunity in this code: 1741 1742%struct.bf = type { i64, i16, i16, i32 } 1743 1744@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] 1745 1746define void @t1() nounwind ssp { 1747entry: 1748 %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] 1749 %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] 1750 %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] 1751 %3 = load i32* %2, align 1 ; <i32> [#uses=1] 1752 %4 = and i32 %3, -65537 ; <i32> [#uses=1] 1753 store i32 %4, i32* %2, align 1 1754 %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] 1755 %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] 1756 %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] 1757 %8 = load i32* %7, align 1 ; <i32> [#uses=1] 1758 %9 = and i32 %8, -131073 ; <i32> [#uses=1] 1759 store i32 %9, i32* %7, align 1 1760 ret void 1761} 1762 1763LLVM currently emits this: 1764 1765 movq bfi(%rip), %rax 1766 andl $-65537, 8(%rax) 1767 movq bfi(%rip), %rax 1768 andl $-131073, 8(%rax) 1769 ret 1770 1771It could narrow the loads and stores to emit this: 1772 1773 movq bfi(%rip), %rax 1774 andb $-2, 10(%rax) 1775 movq bfi(%rip), %rax 1776 andb $-3, 10(%rax) 1777 ret 1778 1779The trouble is that there is a TokenFactor between the store and the 1780load, making it non-trivial to determine if there's anything between 1781the load and the store which would prohibit narrowing. 1782 1783//===---------------------------------------------------------------------===// 1784 1785This code: 1786void foo(unsigned x) { 1787 if (x == 0) bar(); 1788 else if (x == 1) qux(); 1789} 1790 1791currently compiles into: 1792_foo: 1793 movl 4(%esp), %eax 1794 cmpl $1, %eax 1795 je LBB0_3 1796 testl %eax, %eax 1797 jne LBB0_4 1798 1799the testl could be removed: 1800_foo: 1801 movl 4(%esp), %eax 1802 cmpl $1, %eax 1803 je LBB0_3 1804 jb LBB0_4 1805 18060 is the only unsigned number < 1. 1807 1808//===---------------------------------------------------------------------===// 1809 1810This code: 1811 1812%0 = type { i32, i1 } 1813 1814define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp { 1815entry: 1816 %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x) 1817 %cmp = extractvalue %0 %uadd, 1 1818 %inc = zext i1 %cmp to i32 1819 %add = add i32 %x, %sum 1820 %z.0 = add i32 %add, %inc 1821 ret i32 %z.0 1822} 1823 1824declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone 1825 1826compiles to: 1827 1828_add32carry: ## @add32carry 1829 addl %esi, %edi 1830 sbbl %ecx, %ecx 1831 movl %edi, %eax 1832 subl %ecx, %eax 1833 ret 1834 1835But it could be: 1836 1837_add32carry: 1838 leal (%rsi,%rdi), %eax 1839 cmpl %esi, %eax 1840 adcl $0, %eax 1841 ret 1842 1843//===---------------------------------------------------------------------===// 1844 1845The hot loop of 256.bzip2 contains code that looks a bit like this: 1846 1847int foo(char *P, char *Q, int x, int y) { 1848 if (P[0] != Q[0]) 1849 return P[0] < Q[0]; 1850 if (P[1] != Q[1]) 1851 return P[1] < Q[1]; 1852 if (P[2] != Q[2]) 1853 return P[2] < Q[2]; 1854 return P[3] < Q[3]; 1855} 1856 1857In the real code, we get a lot more wrong than this. However, even in this 1858code we generate: 1859 1860_foo: ## @foo 1861## BB#0: ## %entry 1862 movb (%rsi), %al 1863 movb (%rdi), %cl 1864 cmpb %al, %cl 1865 je LBB0_2 1866LBB0_1: ## %if.then 1867 cmpb %al, %cl 1868 jmp LBB0_5 1869LBB0_2: ## %if.end 1870 movb 1(%rsi), %al 1871 movb 1(%rdi), %cl 1872 cmpb %al, %cl 1873 jne LBB0_1 1874## BB#3: ## %if.end38 1875 movb 2(%rsi), %al 1876 movb 2(%rdi), %cl 1877 cmpb %al, %cl 1878 jne LBB0_1 1879## BB#4: ## %if.end60 1880 movb 3(%rdi), %al 1881 cmpb 3(%rsi), %al 1882LBB0_5: ## %if.end60 1883 setl %al 1884 movzbl %al, %eax 1885 ret 1886 1887Note that we generate jumps to LBB0_1 which does a redundant compare. The 1888redundant compare also forces the register values to be live, which prevents 1889folding one of the loads into the compare. In contrast, GCC 4.2 produces: 1890 1891_foo: 1892 movzbl (%rsi), %eax 1893 cmpb %al, (%rdi) 1894 jne L10 1895L12: 1896 movzbl 1(%rsi), %eax 1897 cmpb %al, 1(%rdi) 1898 jne L10 1899 movzbl 2(%rsi), %eax 1900 cmpb %al, 2(%rdi) 1901 jne L10 1902 movzbl 3(%rdi), %eax 1903 cmpb 3(%rsi), %al 1904L10: 1905 setl %al 1906 movzbl %al, %eax 1907 ret 1908 1909which is "perfect". 1910 1911//===---------------------------------------------------------------------===// 1912 1913For the branch in the following code: 1914int a(); 1915int b(int x, int y) { 1916 if (x & (1<<(y&7))) 1917 return a(); 1918 return y; 1919} 1920 1921We currently generate: 1922 movb %sil, %al 1923 andb $7, %al 1924 movzbl %al, %eax 1925 btl %eax, %edi 1926 jae .LBB0_2 1927 1928movl+andl would be shorter than the movb+andb+movzbl sequence. 1929 1930//===---------------------------------------------------------------------===// 1931 1932For the following: 1933struct u1 { 1934 float x, y; 1935}; 1936float foo(struct u1 u) { 1937 return u.x + u.y; 1938} 1939 1940We currently generate: 1941 movdqa %xmm0, %xmm1 1942 pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0] 1943 addss %xmm1, %xmm0 1944 ret 1945 1946We could save an instruction here by commuting the addss. 1947 1948//===---------------------------------------------------------------------===// 1949 1950This (from PR9661): 1951 1952float clamp_float(float a) { 1953 if (a > 1.0f) 1954 return 1.0f; 1955 else if (a < 0.0f) 1956 return 0.0f; 1957 else 1958 return a; 1959} 1960 1961Could compile to: 1962 1963clamp_float: # @clamp_float 1964 movss .LCPI0_0(%rip), %xmm1 1965 minss %xmm1, %xmm0 1966 pxor %xmm1, %xmm1 1967 maxss %xmm1, %xmm0 1968 ret 1969 1970with -ffast-math. 1971 1972//===---------------------------------------------------------------------===// 1973 1974This function (from PR9803): 1975 1976int clamp2(int a) { 1977 if (a > 5) 1978 a = 5; 1979 if (a < 0) 1980 return 0; 1981 return a; 1982} 1983 1984Compiles to: 1985 1986_clamp2: ## @clamp2 1987 pushq %rbp 1988 movq %rsp, %rbp 1989 cmpl $5, %edi 1990 movl $5, %ecx 1991 cmovlel %edi, %ecx 1992 testl %ecx, %ecx 1993 movl $0, %eax 1994 cmovnsl %ecx, %eax 1995 popq %rbp 1996 ret 1997 1998The move of 0 could be scheduled above the test to make it is xor reg,reg. 1999 2000//===---------------------------------------------------------------------===// 2001 2002GCC PR48986. We currently compile this: 2003 2004void bar(void); 2005void yyy(int* p) { 2006 if (__sync_fetch_and_add(p, -1) == 1) 2007 bar(); 2008} 2009 2010into: 2011 movl $-1, %eax 2012 lock 2013 xaddl %eax, (%rdi) 2014 cmpl $1, %eax 2015 je LBB0_2 2016 2017Instead we could generate: 2018 2019 lock 2020 dec %rdi 2021 je LBB0_2 2022 2023The trick is to match "fetch_and_add(X, -C) == C". 2024 2025//===---------------------------------------------------------------------===// 2026 2027unsigned t(unsigned a, unsigned b) { 2028 return a <= b ? 5 : -5; 2029} 2030 2031We generate: 2032 movl $5, %ecx 2033 cmpl %esi, %edi 2034 movl $-5, %eax 2035 cmovbel %ecx, %eax 2036 2037GCC: 2038 cmpl %edi, %esi 2039 sbbl %eax, %eax 2040 andl $-10, %eax 2041 addl $5, %eax 2042 2043//===---------------------------------------------------------------------===// 2044