//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
  xorl %eax, %eax
  xorl %edx, %edx
  testb $32, %cl
  sete %al
  setne %dl
  sall %cl, %eax
  sall %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better.  It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
  movl %ecx, %eax
  shrl $5, %eax
  movl %eax, %edx
  xorl $1, %edx
  sall %cl, %eax
  sall %cl, %edx

64-bit shifts (in general) expand to really bad code.  Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure.  E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator.  Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG.  We still need to teach
the coalescer how to deal with it though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing.  Need to investigate.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare.  The
pattern is written as (cmp reg, (load p)).  Because the compare isn't
commutative, it is not matched with the load on both sides.  The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
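
A minimal C illustration (the function name is made up) of how the load ends up
on the LHS of such a compare:

int load_on_lhs(int *p, int x) {
  /* Naturally selected as (cmp (load p), x).  Canonicalizing it to
     (cmp x, (load p)) and using the swapped condition code (setg vs. setl)
     would let isel fold the load into the cmp. */
  return *p < x;
}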

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
  movl 8(%esp), %eax
  cmpl %eax, 4(%esp)
  setl %al
  movzbl %al, %eax
  ret

On some processors (which ones?), it is more efficient to do this:

_test:
  movl 8(%esp), %ebx
  xor %eax, %eax
  cmpl %ebx, 4(%esp)
  setl %al
  ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important.  e.g., for:

void setbit(int *target, int bit) {
  *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
  *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

  movl $16843009, 4(%edx)
  movl $16843009, (%edx)
  movw $257, 8(%edx)

It might be better to generate

  movl $16843009, %eax
  movl %eax, 4(%edx)
  movl %eax, (%edx)
  movw %ax, 8(%edx)

when we can spare a register.  It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
get this:

define i32 @test1(i32 %X) {
  %Y = sdiv i32 %X, 8
  ret i32 %Y
}

_test1:
  movl 4(%esp), %eax
  movl %eax, %ecx
  sarl $31, %ecx
  shrl $29, %ecx
  addl %ecx, %eax
  sarl $3, %eax
  ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
  movl 4(%esp), %eax
  cmpl $-1, %eax
  leal 7(%eax), %ecx
  cmovle %ecx, %eax
  sarl $3, %eax
  ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl.  We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
  x * copysign(1.0, y) * copysign(1.0, z)

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and
Treesort.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

_test:
  movl 8(%esp), %eax
  movaps (%eax), %xmm0
  movl 4(%esp), %eax
  movaps (%eax), %xmm1
  comiss %xmm0, %xmm1
  setae %al
  movzbl %al, %ecx
  movl $3, %eax
  movl $4, %edx
  cmpl $0, %ecx
  cmove %edx, %eax
  ret

Note that the setae, movzbl, cmpl, and cmove can be replaced with a single
cmovae.  There are a number of issues:
1) We are introducing a setcc between the result of the intrinsic call and the
   select.
2) The intrinsic is expected to produce an i32 value, so an any_extend (which
   becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
  imull $3, 4(%esp), %eax

Perhaps a movl + leal sequence is what we really should generate instead.
Is imull three or four cycles?  Note: ICC generates this:
  movl 4(%esp), %eax
  leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity.  The former is
more "complex" because it folds a load, so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority?  We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better.  It's two bytes
shorter than movl + leal.

On a Pentium M, both variants have the same characteristics with regard
to throughput; however, the multiplication has a latency of four cycles, as
opposed to two cycles for the movl+lea variant.

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

define i32 @foo(i32* %a, i32 %t) {
entry:
  br label %cond_true

cond_true:  ; preds = %cond_true, %entry
  %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]  ; <i32> [#uses=3]
  %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]  ; <i32> [#uses=1]
  %tmp2 = getelementptr i32* %a, i32 %x.0.0  ; <i32*> [#uses=1]
  %tmp3 = load i32* %tmp2  ; <i32> [#uses=1]
  %tmp5 = add i32 %t_addr.0.0, %x.0.0  ; <i32> [#uses=1]
  %tmp7 = add i32 %tmp5, %tmp3  ; <i32> [#uses=2]
  %tmp9 = add i32 %x.0.0, 1  ; <i32> [#uses=2]
  %tmp = icmp sgt i32 %tmp9, 39  ; <i1> [#uses=1]
  br i1 %tmp, label %bb12, label %cond_true

bb12:  ; preds = %cond_true
  ret i32 %tmp7
}
is pessimized by -loop-reduce and -indvars.

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000  subl     $0x04,%esp
00000003  movl     0x08(%esp,1),%eax
00000007  movl     %eax,%ecx
00000009  shrl     $0x10,%ecx
0000000c  cvtsi2ss %ecx,%xmm0
00000010  andl     $0x0000ffff,%eax
00000015  cvtsi2ss %eax,%xmm1
00000019  mulss    0x00000078,%xmm0
00000021  addss    %xmm1,%xmm0
00000025  movss    %xmm0,(%esp,1)
0000002a  flds     (%esp,1)
0000002d  addl     $0x04,%esp
00000030  ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".

//===---------------------------------------------------------------------===//

Consider the expansion of:

define i32 @test3(i32 %X) {
  %tmp1 = urem i32 %X, 255
  ret i32 %tmp1
}

Currently it compiles to:

...
  movl $2155905153, %ecx
  movl 8(%esp), %esi
  movl %esi, %eax
  mull %ecx
...

This could be "reassociated" into:

  movl $2155905153, %eax
  movl 8(%esp), %ecx
  mull %ecx

to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the #uses of the mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary.  This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable.  Example from 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary.  In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.
359 360532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines 361942 942 0x3d03 movl %dh, (1809(%esp, %esi) 362937 937 0x3d0a incl %esi 3633 3 0x3d0b cmpb %bl, %dl 36427 27 0x3d0d jnz 0x000062db <main+11707> 365 366//===---------------------------------------------------------------------===// 367 368In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. 369 370//===---------------------------------------------------------------------===// 371 372This could be a single 16-bit load. 373 374int f(char *p) { 375 if ((p[0] == 1) & (p[1] == 2)) return 1; 376 return 0; 377} 378 379//===---------------------------------------------------------------------===// 380 381We should inline lrintf and probably other libc functions. 382 383//===---------------------------------------------------------------------===// 384 385This code: 386 387void test(int X) { 388 if (X) abort(); 389} 390 391is currently compiled to: 392 393_test: 394 subl $12, %esp 395 cmpl $0, 16(%esp) 396 jne LBB1_1 397 addl $12, %esp 398 ret 399LBB1_1: 400 call L_abort$stub 401 402It would be better to produce: 403 404_test: 405 subl $12, %esp 406 cmpl $0, 16(%esp) 407 jne L_abort$stub 408 addl $12, %esp 409 ret 410 411This can be applied to any no-return function call that takes no arguments etc. 412Alternatively, the stack save/restore logic could be shrink-wrapped, producing 413something like this: 414 415_test: 416 cmpl $0, 4(%esp) 417 jne LBB1_1 418 ret 419LBB1_1: 420 subl $12, %esp 421 call L_abort$stub 422 423Both are useful in different situations. Finally, it could be shrink-wrapped 424and tail called, like this: 425 426_test: 427 cmpl $0, 4(%esp) 428 jne LBB1_1 429 ret 430LBB1_1: 431 pop %eax # realign stack. 432 call L_abort$stub 433 434Though this probably isn't worth it. 435 436//===---------------------------------------------------------------------===// 437 438Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with 439a neg instead of a sub instruction. Consider: 440 441int test(char X) { return 7-X; } 442 443we currently produce: 444_test: 445 movl $7, %eax 446 movsbl 4(%esp), %ecx 447 subl %ecx, %eax 448 ret 449 450We would use one fewer register if codegen'd as: 451 452 movsbl 4(%esp), %eax 453 neg %eax 454 add $7, %eax 455 ret 456 457Note that this isn't beneficial if the load can be folded into the sub. In 458this case, we want a sub: 459 460int test(int X) { return 7-X; } 461_test: 462 movl $7, %eax 463 subl 4(%esp), %eax 464 ret 465 466//===---------------------------------------------------------------------===// 467 468Leaf functions that require one 4-byte spill slot have a prolog like this: 469 470_foo: 471 pushl %esi 472 subl $4, %esp 473... 474and an epilog like this: 475 addl $4, %esp 476 popl %esi 477 ret 478 479It would be smaller, and potentially faster, to push eax on entry and to 480pop into a dummy register instead of using addl/subl of esp. Just don't pop 481into any return registers :) 482 483//===---------------------------------------------------------------------===// 484 485The X86 backend should fold (branch (or (setcc, setcc))) into multiple 486branches. We generate really poor code for: 487 488double testf(double a) { 489 return a == 0.0 ? 0.0 : (a > 0.0 ? 
1.0 : -1.0); 490} 491 492For example, the entry BB is: 493 494_testf: 495 subl $20, %esp 496 pxor %xmm0, %xmm0 497 movsd 24(%esp), %xmm1 498 ucomisd %xmm0, %xmm1 499 setnp %al 500 sete %cl 501 testb %cl, %al 502 jne LBB1_5 # UnifiedReturnBlock 503LBB1_1: # cond_true 504 505 506it would be better to replace the last four instructions with: 507 508 jp LBB1_1 509 je LBB1_5 510LBB1_1: 511 512We also codegen the inner ?: into a diamond: 513 514 cvtss2sd LCPI1_0(%rip), %xmm2 515 cvtss2sd LCPI1_1(%rip), %xmm3 516 ucomisd %xmm1, %xmm0 517 ja LBB1_3 # cond_true 518LBB1_2: # cond_true 519 movapd %xmm3, %xmm2 520LBB1_3: # cond_true 521 movapd %xmm2, %xmm0 522 ret 523 524We should sink the load into xmm3 into the LBB1_2 block. This should 525be pretty easy, and will nuke all the copies. 526 527//===---------------------------------------------------------------------===// 528 529This: 530 #include <algorithm> 531 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b) 532 { return std::make_pair(a + b, a + b < a); } 533 bool no_overflow(unsigned a, unsigned b) 534 { return !full_add(a, b).second; } 535 536Should compile to: 537 addl %esi, %edi 538 setae %al 539 movzbl %al, %eax 540 ret 541 542on x86-64, instead of the rather stupid-looking: 543 addl %esi, %edi 544 setb %al 545 xorb $1, %al 546 movzbl %al, %eax 547 ret 548 549 550//===---------------------------------------------------------------------===// 551 552The following code: 553 554bb114.preheader: ; preds = %cond_next94 555 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1] 556 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1] 557 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1] 558 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1] 559 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1] 560 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2] 561 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1] 562 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1] 563 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1] 564 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1] 565 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1] 566 br label %bb114 567 568produces: 569 570LBB3_5: # bb114.preheader 571 movswl -68(%ebp), %eax 572 movl $32, %ecx 573 movl %ecx, -80(%ebp) 574 subl %eax, -80(%ebp) 575 movswl -52(%ebp), %eax 576 movl %ecx, -84(%ebp) 577 subl %eax, -84(%ebp) 578 movswl -70(%ebp), %eax 579 movl %ecx, -88(%ebp) 580 subl %eax, -88(%ebp) 581 movswl -50(%ebp), %eax 582 subl %eax, %ecx 583 movl %ecx, -76(%ebp) 584 movswl -42(%ebp), %eax 585 movl %eax, -92(%ebp) 586 movswl -66(%ebp), %eax 587 movl %eax, -96(%ebp) 588 movw $0, -98(%ebp) 589 590This appears to be bad because the RA is not folding the store to the stack 591slot into the movl. The above instructions could be: 592 movl $32, -80(%ebp) 593... 594 movl $32, -84(%ebp) 595... 596This seems like a cross between remat and spill folding. 597 598This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't 599change, so we could simply subtract %eax from %ecx first and then use %ecx (or 600vice-versa). 601 602//===---------------------------------------------------------------------===// 603 604This code: 605 606 %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1] 607 br i1 %tmp659, label %cond_true662, label %cond_next715 608 609produces this: 610 611 testw %cx, %cx 612 movswl %cx, %esi 613 jns LBB4_109 # cond_next715 614 615Shark tells us that using %cx in the testw instruction is sub-optimal. 
It 616suggests using the 32-bit register (which is what ICC uses). 617 618//===---------------------------------------------------------------------===// 619 620We compile this: 621 622void compare (long long foo) { 623 if (foo < 4294967297LL) 624 abort(); 625} 626 627to: 628 629compare: 630 subl $4, %esp 631 cmpl $0, 8(%esp) 632 setne %al 633 movzbw %al, %ax 634 cmpl $1, 12(%esp) 635 setg %cl 636 movzbw %cl, %cx 637 cmove %ax, %cx 638 testb $1, %cl 639 jne .LBB1_2 # UnifiedReturnBlock 640.LBB1_1: # ifthen 641 call abort 642.LBB1_2: # UnifiedReturnBlock 643 addl $4, %esp 644 ret 645 646(also really horrible code on ppc). This is due to the expand code for 64-bit 647compares. GCC produces multiple branches, which is much nicer: 648 649compare: 650 subl $12, %esp 651 movl 20(%esp), %edx 652 movl 16(%esp), %eax 653 decl %edx 654 jle .L7 655.L5: 656 addl $12, %esp 657 ret 658 .p2align 4,,7 659.L7: 660 jl .L4 661 cmpl $0, %eax 662 .p2align 4,,8 663 ja .L5 664.L4: 665 .p2align 4,,9 666 call abort 667 668//===---------------------------------------------------------------------===// 669 670Tail call optimization improvements: Tail call optimization currently 671pushes all arguments on the top of the stack (their normal place for 672non-tail call optimized calls) that source from the callers arguments 673or that source from a virtual register (also possibly sourcing from 674callers arguments). 675This is done to prevent overwriting of parameters (see example 676below) that might be used later. 677 678example: 679 680int callee(int32, int64); 681int caller(int32 arg1, int32 arg2) { 682 int64 local = arg2 * 2; 683 return callee(arg2, (int64)local); 684} 685 686[arg1] [!arg2 no longer valid since we moved local onto it] 687[arg2] -> [(int64) 688[RETADDR] local ] 689 690Moving arg1 onto the stack slot of callee function would overwrite 691arg2 of the caller. 692 693Possible optimizations: 694 695 696 - Analyse the actual parameters of the callee to see which would 697 overwrite a caller parameter which is used by the callee and only 698 push them onto the top of the stack. 699 700 int callee (int32 arg1, int32 arg2); 701 int caller (int32 arg1, int32 arg2) { 702 return callee(arg1,arg2); 703 } 704 705 Here we don't need to write any variables to the top of the stack 706 since they don't overwrite each other. 707 708 int callee (int32 arg1, int32 arg2); 709 int caller (int32 arg1, int32 arg2) { 710 return callee(arg2,arg1); 711 } 712 713 Here we need to push the arguments because they overwrite each 714 other. 715 716//===---------------------------------------------------------------------===// 717 718main () 719{ 720 int i = 0; 721 unsigned long int z = 0; 722 723 do { 724 z -= 0x00004000; 725 i++; 726 if (i > 0x00040000) 727 abort (); 728 } while (z > 0); 729 exit (0); 730} 731 732gcc compiles this to: 733 734_main: 735 subl $28, %esp 736 xorl %eax, %eax 737 jmp L2 738L3: 739 cmpl $262144, %eax 740 je L10 741L2: 742 addl $1, %eax 743 cmpl $262145, %eax 744 jne L3 745 call L_abort$stub 746L10: 747 movl $0, (%esp) 748 call L_exit$stub 749 750llvm: 751 752_main: 753 subl $12, %esp 754 movl $1, %eax 755 movl $16384, %ecx 756LBB1_1: # bb 757 cmpl $262145, %eax 758 jge LBB1_4 # cond_true 759LBB1_2: # cond_next 760 incl %eax 761 addl $4294950912, %ecx 762 cmpl $16384, %ecx 763 jne LBB1_1 # bb 764LBB1_3: # bb11 765 xorl %eax, %eax 766 addl $12, %esp 767 ret 768LBB1_4: # cond_true 769 call L_abort$stub 770 7711. LSR should rewrite the first cmp with induction variable %ecx. 7722. 
DAG combiner should fold 773 leal 1(%eax), %edx 774 cmpl $262145, %edx 775 => 776 cmpl $262144, %eax 777 778//===---------------------------------------------------------------------===// 779 780define i64 @test(double %X) { 781 %Y = fptosi double %X to i64 782 ret i64 %Y 783} 784 785compiles to: 786 787_test: 788 subl $20, %esp 789 movsd 24(%esp), %xmm0 790 movsd %xmm0, 8(%esp) 791 fldl 8(%esp) 792 fisttpll (%esp) 793 movl 4(%esp), %edx 794 movl (%esp), %eax 795 addl $20, %esp 796 #FP_REG_KILL 797 ret 798 799This should just fldl directly from the input stack slot. 800 801//===---------------------------------------------------------------------===// 802 803This code: 804int foo (int x) { return (x & 65535) | 255; } 805 806Should compile into: 807 808_foo: 809 movzwl 4(%esp), %eax 810 orl $255, %eax 811 ret 812 813instead of: 814_foo: 815 movl $65280, %eax 816 andl 4(%esp), %eax 817 orl $255, %eax 818 ret 819 820//===---------------------------------------------------------------------===// 821 822We're codegen'ing multiply of long longs inefficiently: 823 824unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { 825 return arg1 * arg2; 826} 827 828We compile to (fomit-frame-pointer): 829 830_LLM: 831 pushl %esi 832 movl 8(%esp), %ecx 833 movl 16(%esp), %esi 834 movl %esi, %eax 835 mull %ecx 836 imull 12(%esp), %esi 837 addl %edx, %esi 838 imull 20(%esp), %ecx 839 movl %esi, %edx 840 addl %ecx, %edx 841 popl %esi 842 ret 843 844This looks like a scheduling deficiency and lack of remat of the load from 845the argument area. ICC apparently produces: 846 847 movl 8(%esp), %ecx 848 imull 12(%esp), %ecx 849 movl 16(%esp), %eax 850 imull 4(%esp), %eax 851 addl %eax, %ecx 852 movl 4(%esp), %eax 853 mull 12(%esp) 854 addl %ecx, %edx 855 ret 856 857Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: 858http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 859 860//===---------------------------------------------------------------------===// 861 862We can fold a store into "zeroing a reg". Instead of: 863 864xorl %eax, %eax 865movl %eax, 124(%esp) 866 867we should get: 868 869movl $0, 124(%esp) 870 871if the flags of the xor are dead. 872 873Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should 874be folded into: shl [mem], 1 875 876//===---------------------------------------------------------------------===// 877 878In SSE mode, we turn abs and neg into a load from the constant pool plus a xor 879or and instruction, for example: 880 881 xorpd LCPI1_0, %xmm2 882 883However, if xmm2 gets spilled, we end up with really ugly code like this: 884 885 movsd (%esp), %xmm0 886 xorpd LCPI1_0, %xmm0 887 movsd %xmm0, (%esp) 888 889Since we 'know' that this is a 'neg', we can actually "fold" the spill into 890the neg/abs instruction, turning it into an *integer* operation, like this: 891 892 xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) 893 894you could also use xorb, but xorl is less likely to lead to a partial register 895stall. Here is a contrived testcase: 896 897double a, b, c; 898void test(double *P) { 899 double X = *P; 900 a = X; 901 bar(); 902 X = -X; 903 b = X; 904 bar(); 905 c = X; 906} 907 908//===---------------------------------------------------------------------===// 909 910The generated code on x86 for checking for signed overflow on a multiply the 911obvious way is much longer than it needs to be. 

int x(int a, int b) {
  long long prod = (long long)a*b;
  return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
}

See PR2053 for more details.

//===---------------------------------------------------------------------===//

We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter.  The downside is that it puts more
pressure on register allocation because it has fixed operands.

Example:
int abs(int x) {return x < 0 ? -x : x;}

gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
  movl 4(%esp), %eax
  cltd
  xorl %edx, %eax
  subl %edx, %eax
  ret

//===---------------------------------------------------------------------===//

Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  return 0;
}

The following code is currently generated:
FirstOnet:
  movl 8(%esp), %eax
  cmpl $65536, %eax
  movl 4(%esp), %ecx
  jb .LBB1_2 # UnifiedReturnBlock
.LBB1_1: # ifthen
  shrl $16, %eax
  movzbl first_one(%eax), %eax
  ret
.LBB1_2: # UnifiedReturnBlock
  xorl %eax, %eax
  ret

We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
lets us change the cmpl into a testl, which is shorter, and eliminate the shift.

//===---------------------------------------------------------------------===//

We compile this function:

define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
entry:
  %tmp2 = icmp eq i8 %d, 0  ; <i1> [#uses=1]
  br i1 %tmp2, label %bb7, label %bb

bb:  ; preds = %entry
  %tmp6 = add i32 %b, %a  ; <i32> [#uses=1]
  ret i32 %tmp6

bb7:  ; preds = %entry
  %tmp10 = sub i32 %a, %c  ; <i32> [#uses=1]
  ret i32 %tmp10
}

to:

foo:        # @foo
# %bb.0:    # %entry
  movl 4(%esp), %ecx
  cmpb $0, 16(%esp)
  je .LBB0_2
# %bb.1:    # %bb
  movl 8(%esp), %eax
  addl %ecx, %eax
  ret
.LBB0_2:    # %bb7
  movl 12(%esp), %edx
  movl %ecx, %eax
  subl %edx, %eax
  ret

There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
couple more movls by putting 4(%esp) into %eax instead of %ecx.

//===---------------------------------------------------------------------===//

See rdar://4653682.

From flops:

LBB1_15:        # bb310
  cvtss2sd LCPI1_0, %xmm1
  addsd %xmm1, %xmm0
  movsd 176(%esp), %xmm2
  mulsd %xmm0, %xmm2
  movapd %xmm2, %xmm3
  mulsd %xmm3, %xmm3
  movapd %xmm3, %xmm4
  mulsd LCPI1_23, %xmm4
  addsd LCPI1_24, %xmm4
  mulsd %xmm3, %xmm4
  addsd LCPI1_25, %xmm4
  mulsd %xmm3, %xmm4
  addsd LCPI1_26, %xmm4
  mulsd %xmm3, %xmm4
  addsd LCPI1_27, %xmm4
  mulsd %xmm3, %xmm4
  addsd LCPI1_28, %xmm4
  mulsd %xmm3, %xmm4
  addsd %xmm1, %xmm4
  mulsd %xmm2, %xmm4
  movsd 152(%esp), %xmm1
  addsd %xmm4, %xmm1
  movsd %xmm1, 152(%esp)
  incl %eax
  cmpl %eax, %esi
  jge LBB1_15   # bb310
LBB1_16:        # bb358.loopexit
  movsd 152(%esp), %xmm0
  addsd %xmm0, %xmm0
  addsd LCPI1_22, %xmm0
  movsd %xmm0, 152(%esp)

Rather than spilling the result of the last addsd in the loop, we should have
inserted a copy to split the interval (one for the duration of the loop, one
extending to the fall through).  The register pressure in the loop isn't high
enough to warrant the spill.

Also check why xmm7 is not used at all in the function.

//===---------------------------------------------------------------------===//

Take the following:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false  ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
entry:
  %tmp.b.i = load i1* @in_exit.4870.b  ; <i1> [#uses=1]
  br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:  ; preds = %entry
  tail call void @exit( i32 1 ) noreturn nounwind
  unreachable
bb4.i:  ; preds = %entry
  store i1 true, i1* @in_exit.4870.b
  tail call void @exit( i32 1 ) noreturn nounwind
  unreachable
}
declare void @exit(i32) noreturn nounwind

This compiles into:
_abort_gzip:  ## @abort_gzip
## %bb.0:     ## %entry
  subl $12, %esp
  movb _in_exit.4870.b, %al
  cmpb $1, %al
  jne LBB0_2

We somehow miss folding the movb into the cmpb.

//===---------------------------------------------------------------------===//

We compile:

int test(int x, int y) {
  return x-y-1;
}

into (-m64):

_test:
  decl %edi
  movl %edi, %eax
  subl %esi, %eax
  ret

It would be better to codegen this as x+~y (notl+addl).

//===---------------------------------------------------------------------===//

This code:

int foo(const char *str,...)
1107{ 1108 __builtin_va_list a; int x; 1109 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a); 1110 return x; 1111} 1112 1113gets compiled into this on x86-64: 1114 subq $200, %rsp 1115 movaps %xmm7, 160(%rsp) 1116 movaps %xmm6, 144(%rsp) 1117 movaps %xmm5, 128(%rsp) 1118 movaps %xmm4, 112(%rsp) 1119 movaps %xmm3, 96(%rsp) 1120 movaps %xmm2, 80(%rsp) 1121 movaps %xmm1, 64(%rsp) 1122 movaps %xmm0, 48(%rsp) 1123 movq %r9, 40(%rsp) 1124 movq %r8, 32(%rsp) 1125 movq %rcx, 24(%rsp) 1126 movq %rdx, 16(%rsp) 1127 movq %rsi, 8(%rsp) 1128 leaq (%rsp), %rax 1129 movq %rax, 192(%rsp) 1130 leaq 208(%rsp), %rax 1131 movq %rax, 184(%rsp) 1132 movl $48, 180(%rsp) 1133 movl $8, 176(%rsp) 1134 movl 176(%rsp), %eax 1135 cmpl $47, %eax 1136 jbe .LBB1_3 # bb 1137.LBB1_1: # bb3 1138 movq 184(%rsp), %rcx 1139 leaq 8(%rcx), %rax 1140 movq %rax, 184(%rsp) 1141.LBB1_2: # bb4 1142 movl (%rcx), %eax 1143 addq $200, %rsp 1144 ret 1145.LBB1_3: # bb 1146 movl %eax, %ecx 1147 addl $8, %eax 1148 addq 192(%rsp), %rcx 1149 movl %eax, 176(%rsp) 1150 jmp .LBB1_2 # bb4 1151 1152gcc 4.3 generates: 1153 subq $96, %rsp 1154.LCFI0: 1155 leaq 104(%rsp), %rax 1156 movq %rsi, -80(%rsp) 1157 movl $8, -120(%rsp) 1158 movq %rax, -112(%rsp) 1159 leaq -88(%rsp), %rax 1160 movq %rax, -104(%rsp) 1161 movl $8, %eax 1162 cmpl $48, %eax 1163 jb .L6 1164 movq -112(%rsp), %rdx 1165 movl (%rdx), %eax 1166 addq $96, %rsp 1167 ret 1168 .p2align 4,,10 1169 .p2align 3 1170.L6: 1171 mov %eax, %edx 1172 addq -104(%rsp), %rdx 1173 addl $8, %eax 1174 movl %eax, -120(%rsp) 1175 movl (%rdx), %eax 1176 addq $96, %rsp 1177 ret 1178 1179and it gets compiled into this on x86: 1180 pushl %ebp 1181 movl %esp, %ebp 1182 subl $4, %esp 1183 leal 12(%ebp), %eax 1184 movl %eax, -4(%ebp) 1185 leal 16(%ebp), %eax 1186 movl %eax, -4(%ebp) 1187 movl 12(%ebp), %eax 1188 addl $4, %esp 1189 popl %ebp 1190 ret 1191 1192gcc 4.3 generates: 1193 pushl %ebp 1194 movl %esp, %ebp 1195 movl 12(%ebp), %eax 1196 popl %ebp 1197 ret 1198 1199//===---------------------------------------------------------------------===// 1200 1201Teach tblgen not to check bitconvert source type in some cases. This allows us 1202to consolidate the following patterns in X86InstrMMX.td: 1203 1204def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), 1205 (iPTR 0))))), 1206 (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>; 1207def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), 1208 (iPTR 0))))), 1209 (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>; 1210def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src), 1211 (iPTR 0))))), 1212 (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>; 1213 1214There are other cases in various td files. 1215 1216//===---------------------------------------------------------------------===// 1217 1218Take something like the following on x86-32: 1219unsigned a(unsigned long long x, unsigned y) {return x % y;} 1220 1221We currently generate a libcall, but we really shouldn't: the expansion is 1222shorter and likely faster than the libcall. The expected code is something 1223like the following: 1224 1225 movl 12(%ebp), %eax 1226 movl 16(%ebp), %ecx 1227 xorl %edx, %edx 1228 divl %ecx 1229 movl 8(%ebp), %eax 1230 divl %ecx 1231 movl %edx, %eax 1232 ret 1233 1234A similar code sequence works for division. 
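
A rough C model of the intended expansion (a sketch of the math only, not the
code the backend would emit; the helper name is made up).  Divide the high word
first, then divide the remainder paired with the low word:

unsigned rem64by32(unsigned long long x, unsigned y) {
  unsigned hi = (unsigned)(x >> 32), lo = (unsigned)x;
  unsigned rhi = hi % y;                         /* first divl: high word */
  unsigned long long t = ((unsigned long long)rhi << 32) | lo;
  return (unsigned)(t % y);                      /* second divl: rhi:lo by y */
}

Since rhi < y, the second quotient always fits in 32 bits, so the second divl
cannot fault.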

//===---------------------------------------------------------------------===//

We currently compile this:

define i32 @func1(i32 %v1, i32 %v2) nounwind {
entry:
  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
  %sum = extractvalue {i32, i1} %t, 0
  %obit = extractvalue {i32, i1} %t, 1
  br i1 %obit, label %overflow, label %normal
normal:
  ret i32 %sum
overflow:
  call void @llvm.trap()
  unreachable
}
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare void @llvm.trap()

to:

_func1:
  movl 4(%esp), %eax
  addl 8(%esp), %eax
  jo LBB1_2 ## overflow
LBB1_1: ## normal
  ret
LBB1_2: ## overflow
  ud2

It would be nice to produce "into" someday.

//===---------------------------------------------------------------------===//

Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions.  This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others.  It is also currently not done
for read-modify-write instructions.  It is also currently not done if the
OF or CF flags are needed.

The shift operators have the complication that when the shift count is
zero, EFLAGS is not set, so they can only subsume a test instruction if
the shift count is known to be non-zero.  Also, using the EFLAGS value
from a shift is apparently very slow on some x86 implementations.

In read-modify-write instructions, the root node in the isel match is
the store, and isel has no way for the use of the EFLAGS result of the
arithmetic to be remapped to the new node.

Add and subtract instructions set OF on signed overflow and CF on unsigned
overflow, while test instructions always clear OF and CF.  In order to
replace a test with an add or subtract in a situation where OF or CF is
needed, codegen must be able to prove that the operation cannot see
signed or unsigned overflow, respectively.

//===---------------------------------------------------------------------===//

memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
define <16 x float> @foo(<16 x float> %A) nounwind {
  %tmp = alloca <16 x float>, align 16
  %tmp2 = alloca <16 x float>, align 16
  store <16 x float> %A, <16 x float>* %tmp
  %s = bitcast <16 x float>* %tmp to i8*
  %s2 = bitcast <16 x float>* %tmp2 to i8*
  call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
  %R = load <16 x float>* %tmp2
  ret <16 x float> %R
}

declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind

which compiles to:

_foo:
  subl $140, %esp
  movaps %xmm3, 112(%esp)
  movaps %xmm2, 96(%esp)
  movaps %xmm1, 80(%esp)
  movaps %xmm0, 64(%esp)
  movl 60(%esp), %eax
  movl %eax, 124(%esp)
  movl 56(%esp), %eax
  movl %eax, 120(%esp)
  movl 52(%esp), %eax
  <many many more 32-bit copies>
  movaps (%esp), %xmm0
  movaps 16(%esp), %xmm1
  movaps 32(%esp), %xmm2
  movaps 48(%esp), %xmm3
  addl $140, %esp
  ret

On Nehalem, it may even be cheaper to just use movups when unaligned than to
fall back to lower-granularity chunks.
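
Another way to see the same issue, as a plain C sketch (the type and function
names are made up): a 64-byte copy between 16-byte-aligned objects should
become four 16-byte moves (movaps, or movups on Nehalem-class parts) rather
than a long run of 32-bit loads and stores:

struct v64 { float f[16]; } __attribute__((aligned(16)));  /* 64 bytes */
void copy64(struct v64 *dst, const struct v64 *src) {
  __builtin_memcpy(dst, src, sizeof *dst);
}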

//===---------------------------------------------------------------------===//

Implement processor-specific optimizations for parity with GCC on these
processors.  GCC does two optimizations:

1. ix86_pad_returns inserts a noop before ret instructions if they are
   immediately preceded by a conditional branch or are the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   code contains more than 3 branches.

The first one is done for all AMDs, Core2, and "Generic".
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".

//===---------------------------------------------------------------------===//
Testcase:
int x(int a) { return (a&0xf0)>>4; }

Current output:
  movl 4(%esp), %eax
  shrl $4, %eax
  andl $15, %eax
  ret

Ideal output:
  movzbl 4(%esp), %eax
  shrl $4, %eax
  ret

//===---------------------------------------------------------------------===//

Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
properly.

When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to use xadd to implement these.  Instead, it can use
add, sub, inc, or dec instructions with the "lock" prefix.

This is currently implemented using a bit of an instruction selection trick.
The issue is that the target-independent pattern produces one output and a
chain, and we want to map it into one that just outputs a chain.  The current
trick is to select it into a MERGE_VALUES with the first definition being an
implicit_def.  The proper solution is to add new ISD opcodes for the no-output
variant.  The DAG combiner can then transform the node before it gets to
target node selection.

Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions.  We need a
way to add target-specific information to target nodes and have this
information carried over to machine instructions.  The asm printer (or JIT)
can then use this information to add the "lock" prefix.

//===---------------------------------------------------------------------===//

struct B {
  unsigned char y0 : 1;
};

int bar(struct B* a) { return a->y0; }

define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
  %2 = load i8* %1, align 1
  %3 = and i8 %2, 1
  %4 = zext i8 %3 to i32
  ret i32 %4
}

bar:      # @bar
# %bb.0:
  movb (%rdi), %al
  andb $1, %al
  movzbl %al, %eax
  ret

Missed optimization: should be movl+andl.

//===---------------------------------------------------------------------===//

The x86_64 ABI says:

Booleans, when stored in a memory object, are stored as single byte objects the
value of which is always 0 (false) or 1 (true).
1413 1414We are not using this fact: 1415 1416int bar(_Bool *a) { return *a; } 1417 1418define i32 @bar(i8* nocapture %a) nounwind readonly optsize { 1419 %1 = load i8* %a, align 1, !tbaa !0 1420 %tmp = and i8 %1, 1 1421 %2 = zext i8 %tmp to i32 1422 ret i32 %2 1423} 1424 1425bar: 1426 movb (%rdi), %al 1427 andb $1, %al 1428 movzbl %al, %eax 1429 ret 1430 1431GCC produces 1432 1433bar: 1434 movzbl (%rdi), %eax 1435 ret 1436 1437//===---------------------------------------------------------------------===// 1438 1439Take the following C code: 1440int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } 1441 1442We generate the following IR with clang: 1443define i32 @f(i32 %a, i32 %b) nounwind readnone { 1444entry: 1445 %tmp = xor i32 %b, %a ; <i32> [#uses=1] 1446 %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] 1447 %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] 1448 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] 1449 ret i32 %conv5 1450} 1451 1452And the following x86 code: 1453 xorl %esi, %edi 1454 testb $-1, %dil 1455 sete %al 1456 movzbl %al, %eax 1457 ret 1458 1459A cmpb instead of the xorl+testb would be one instruction shorter. 1460 1461//===---------------------------------------------------------------------===// 1462 1463Given the following C code: 1464int f(int a, int b) { return (signed char)a == (signed char)b; } 1465 1466We generate the following IR with clang: 1467define i32 @f(i32 %a, i32 %b) nounwind readnone { 1468entry: 1469 %sext = shl i32 %a, 24 ; <i32> [#uses=1] 1470 %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] 1471 %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] 1472 %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] 1473 %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] 1474 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] 1475 ret i32 %conv5 1476} 1477 1478And the following x86 code: 1479 movsbl %sil, %eax 1480 movsbl %dil, %ecx 1481 cmpl %eax, %ecx 1482 sete %al 1483 movzbl %al, %eax 1484 ret 1485 1486 1487It should be possible to eliminate the sign extensions. 
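
One way to see that the sign extensions are unnecessary (an observation, not
current compiler output): sign extension is injective on bytes, so the compare
succeeds exactly when the low bytes are equal.  That makes this semantically
the same function as the zero-extension example in the previous entry, and the
same single cmpb-based sequence applies:

int f2(int a, int b) { return (unsigned char)a == (unsigned char)b; }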
1488 1489//===---------------------------------------------------------------------===// 1490 1491LLVM misses a load+store narrowing opportunity in this code: 1492 1493%struct.bf = type { i64, i16, i16, i32 } 1494 1495@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] 1496 1497define void @t1() nounwind ssp { 1498entry: 1499 %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] 1500 %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] 1501 %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] 1502 %3 = load i32* %2, align 1 ; <i32> [#uses=1] 1503 %4 = and i32 %3, -65537 ; <i32> [#uses=1] 1504 store i32 %4, i32* %2, align 1 1505 %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] 1506 %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] 1507 %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] 1508 %8 = load i32* %7, align 1 ; <i32> [#uses=1] 1509 %9 = and i32 %8, -131073 ; <i32> [#uses=1] 1510 store i32 %9, i32* %7, align 1 1511 ret void 1512} 1513 1514LLVM currently emits this: 1515 1516 movq bfi(%rip), %rax 1517 andl $-65537, 8(%rax) 1518 movq bfi(%rip), %rax 1519 andl $-131073, 8(%rax) 1520 ret 1521 1522It could narrow the loads and stores to emit this: 1523 1524 movq bfi(%rip), %rax 1525 andb $-2, 10(%rax) 1526 movq bfi(%rip), %rax 1527 andb $-3, 10(%rax) 1528 ret 1529 1530The trouble is that there is a TokenFactor between the store and the 1531load, making it non-trivial to determine if there's anything between 1532the load and the store which would prohibit narrowing. 1533 1534//===---------------------------------------------------------------------===// 1535 1536This code: 1537void foo(unsigned x) { 1538 if (x == 0) bar(); 1539 else if (x == 1) qux(); 1540} 1541 1542currently compiles into: 1543_foo: 1544 movl 4(%esp), %eax 1545 cmpl $1, %eax 1546 je LBB0_3 1547 testl %eax, %eax 1548 jne LBB0_4 1549 1550the testl could be removed: 1551_foo: 1552 movl 4(%esp), %eax 1553 cmpl $1, %eax 1554 je LBB0_3 1555 jb LBB0_4 1556 15570 is the only unsigned number < 1. 1558 1559//===---------------------------------------------------------------------===// 1560 1561This code: 1562 1563%0 = type { i32, i1 } 1564 1565define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp { 1566entry: 1567 %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x) 1568 %cmp = extractvalue %0 %uadd, 1 1569 %inc = zext i1 %cmp to i32 1570 %add = add i32 %x, %sum 1571 %z.0 = add i32 %add, %inc 1572 ret i32 %z.0 1573} 1574 1575declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone 1576 1577compiles to: 1578 1579_add32carry: ## @add32carry 1580 addl %esi, %edi 1581 sbbl %ecx, %ecx 1582 movl %edi, %eax 1583 subl %ecx, %eax 1584 ret 1585 1586But it could be: 1587 1588_add32carry: 1589 leal (%rsi,%rdi), %eax 1590 cmpl %esi, %eax 1591 adcl $0, %eax 1592 ret 1593 1594//===---------------------------------------------------------------------===// 1595 1596The hot loop of 256.bzip2 contains code that looks a bit like this: 1597 1598int foo(char *P, char *Q, int x, int y) { 1599 if (P[0] != Q[0]) 1600 return P[0] < Q[0]; 1601 if (P[1] != Q[1]) 1602 return P[1] < Q[1]; 1603 if (P[2] != Q[2]) 1604 return P[2] < Q[2]; 1605 return P[3] < Q[3]; 1606} 1607 1608In the real code, we get a lot more wrong than this. 
However, even in this
code we generate:

_foo:       ## @foo
## %bb.0:   ## %entry
  movb (%rsi), %al
  movb (%rdi), %cl
  cmpb %al, %cl
  je LBB0_2
LBB0_1:     ## %if.then
  cmpb %al, %cl
  jmp LBB0_5
LBB0_2:     ## %if.end
  movb 1(%rsi), %al
  movb 1(%rdi), %cl
  cmpb %al, %cl
  jne LBB0_1
## %bb.3:   ## %if.end38
  movb 2(%rsi), %al
  movb 2(%rdi), %cl
  cmpb %al, %cl
  jne LBB0_1
## %bb.4:   ## %if.end60
  movb 3(%rdi), %al
  cmpb 3(%rsi), %al
LBB0_5:     ## %if.end60
  setl %al
  movzbl %al, %eax
  ret

Note that we generate jumps to LBB0_1 which does a redundant compare.  The
redundant compare also forces the register values to be live, which prevents
folding one of the loads into the compare.  In contrast, GCC 4.2 produces:

_foo:
  movzbl (%rsi), %eax
  cmpb %al, (%rdi)
  jne L10
L12:
  movzbl 1(%rsi), %eax
  cmpb %al, 1(%rdi)
  jne L10
  movzbl 2(%rsi), %eax
  cmpb %al, 2(%rdi)
  jne L10
  movzbl 3(%rdi), %eax
  cmpb 3(%rsi), %al
L10:
  setl %al
  movzbl %al, %eax
  ret

which is "perfect".

//===---------------------------------------------------------------------===//

For the branch in the following code:
int a();
int b(int x, int y) {
  if (x & (1<<(y&7)))
    return a();
  return y;
}

We currently generate:
  movb %sil, %al
  andb $7, %al
  movzbl %al, %eax
  btl %eax, %edi
  jae .LBB0_2

movl+andl would be shorter than the movb+andb+movzbl sequence.

//===---------------------------------------------------------------------===//

For the following:
struct u1 {
  float x, y;
};
float foo(struct u1 u) {
  return u.x + u.y;
}

We currently generate:
  movdqa %xmm0, %xmm1
  pshufd $1, %xmm0, %xmm0  # xmm0 = xmm0[1,0,0,0]
  addss %xmm1, %xmm0
  ret

We could save an instruction here by commuting the addss.

//===---------------------------------------------------------------------===//

This (from PR9661):

float clamp_float(float a) {
  if (a > 1.0f)
    return 1.0f;
  else if (a < 0.0f)
    return 0.0f;
  else
    return a;
}

Could compile to:

clamp_float:  # @clamp_float
  movss .LCPI0_0(%rip), %xmm1
  minss %xmm1, %xmm0
  pxor %xmm1, %xmm1
  maxss %xmm1, %xmm0
  ret

with -ffast-math.

//===---------------------------------------------------------------------===//

This function (from PR9803):

int clamp2(int a) {
  if (a > 5)
    a = 5;
  if (a < 0)
    return 0;
  return a;
}

Compiles to:

_clamp2:  ## @clamp2
  pushq %rbp
  movq %rsp, %rbp
  cmpl $5, %edi
  movl $5, %ecx
  cmovlel %edi, %ecx
  testl %ecx, %ecx
  movl $0, %eax
  cmovnsl %ecx, %eax
  popq %rbp
  ret

The move of 0 could be scheduled above the test so that it can be materialized
as an xor reg,reg.

//===---------------------------------------------------------------------===//

GCC PR48986.  We currently compile this:

void bar(void);
void yyy(int* p) {
  if (__sync_fetch_and_add(p, -1) == 1)
    bar();
}

into:
  movl $-1, %eax
  lock
  xaddl %eax, (%rdi)
  cmpl $1, %eax
  je LBB0_2

Instead we could generate:

  lock
  decl (%rdi)
  je LBB0_2

The trick is to match "fetch_and_add(X, -C) == C".
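
The same pattern with a constant other than 1 (an illustrative variant, not
from the PR) should likewise map to a locked subtract of the constant,
branching on the resulting ZF:

void bar(void);
void yyy4(int* p) {
  if (__sync_fetch_and_add(p, -4) == 4)   /* old value was 4 <=> new value is 0 */
    bar();
}

i.e. something like "lock; subl $4, (%rdi); je ..." once the general form is
matched.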

//===---------------------------------------------------------------------===//

unsigned t(unsigned a, unsigned b) {
  return a <= b ? 5 : -5;
}

We generate:
  movl $5, %ecx
  cmpl %esi, %edi
  movl $-5, %eax
  cmovbel %ecx, %eax

GCC:
  cmpl %edi, %esi
  sbbl %eax, %eax
  andl $-10, %eax
  addl $5, %eax

//===---------------------------------------------------------------------===//
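
A note on the GCC sequence in the previous entry (a C sketch of the identity it
exploits; the function name is made up): sbbl %eax, %eax materializes 0 or -1
from the carry flag, which then selects between the two constants without a
branch:

unsigned t_branchless(unsigned a, unsigned b) {
  unsigned mask = -(unsigned)(a > b);   /* 0 if a <= b, all ones otherwise (the sbb) */
  return (mask & -10u) + 5u;            /* 5 if a <= b, (unsigned)-5 otherwise */
}

//===---------------------------------------------------------------------===//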