| Name | Date | Size | #Lines | LOC |
|---|---|---|---|---|
| AsmParser/ | 03-May-2024 | - | 1,415 | 1,162 |
| InstPrinter/ | 03-May-2024 | - | 433 | 328 |
| MCTargetDesc/ | 03-May-2024 | - | 1,760 | 1,279 |
| TargetInfo/ | 03-May-2024 | - | 75 | 44 |
| CMakeLists.txt | 03-May-2024 | 1.2 KiB | 43 | 38 |
| LLVMBuild.txt | 03-May-2024 | 1,002 B | 34 | 30 |
| Makefile | 03-May-2024 | 835 B | 24 | 10 |
| PPC.h | 03-May-2024 | 3.1 KiB | 98 | 45 |
| PPC.td | 03-May-2024 | 14.8 KiB | 297 | 273 |
| PPCAsmPrinter.cpp | 03-May-2024 | 42.9 KiB | 1,141 | 846 |
| PPCBranchSelector.cpp | 03-May-2024 | 6.9 KiB | 202 | 119 |
| PPCCTRLoops.cpp | 03-May-2024 | 20.1 KiB | 638 | 474 |
| PPCCallingConv.td | 03-May-2024 | 6.4 KiB | 150 | 115 |
| PPCCodeEmitter.cpp | 03-May-2024 | 11 KiB | 291 | 204 |
| PPCFastISel.cpp | 03-May-2024 | 10.6 KiB | 329 | 221 |
| PPCFrameLowering.cpp | 03-May-2024 | 51.7 KiB | 1,485 | 1,080 |
| PPCFrameLowering.h | 03-May-2024 | 9.2 KiB | 304 | 201 |
| PPCHazardRecognizers.cpp | 03-May-2024 | 8.2 KiB | 246 | 141 |
| PPCHazardRecognizers.h | 03-May-2024 | 3.3 KiB | 92 | 42 |
| PPCISelDAGToDAG.cpp | 03-May-2024 | 58.6 KiB | 1,559 | 1,122 |
| PPCISelLowering.cpp | 03-May-2024 | 317.1 KiB | 7,911 | 5,450 |
| PPCISelLowering.h | 03-May-2024 | 29.2 KiB | 656 | 307 |
| PPCInstr64Bit.td | 03-May-2024 | 47 KiB | 1,055 | 939 |
| PPCInstrAltivec.td | 03-May-2024 | 40 KiB | 843 | 738 |
| PPCInstrBuilder.h | 03-May-2024 | 1.4 KiB | 44 | 14 |
| PPCInstrFormats.td | 03-May-2024 | 28 KiB | 1,083 | 888 |
| PPCInstrInfo.cpp | 03-May-2024 | 55.4 KiB | 1,526 | 1,143 |
| PPCInstrInfo.h | 03-May-2024 | 9.2 KiB | 223 | 136 |
| PPCInstrInfo.td | 03-May-2024 | 115.7 KiB | 2,619 | 2,350 |
| PPCJITInfo.cpp | 03-May-2024 | 17.6 KiB | 472 | 334 |
| PPCJITInfo.h | 03-May-2024 | 1.6 KiB | 50 | 25 |
| PPCMCInstLower.cpp | 03-May-2024 | 6.8 KiB | 207 | 159 |
| PPCMachineFunctionInfo.cpp | 03-May-2024 | 438 B | 16 | 3 |
| PPCMachineFunctionInfo.h | 03-May-2024 | 6.5 KiB | 179 | 82 |
| PPCPerfectShuffle.h | 03-May-2024 | 397.4 KiB | 6,587 | 6,564 |
| PPCRegisterInfo.cpp | 03-May-2024 | 30.8 KiB | 845 | 554 |
| PPCRegisterInfo.h | 03-May-2024 | 3.7 KiB | 109 | 66 |
| PPCRegisterInfo.td | 03-May-2024 | 8.3 KiB | 237 | 202 |
| PPCRelocations.h | 03-May-2024 | 1.9 KiB | 57 | 19 |
| PPCSchedule.td | 03-May-2024 | 14.7 KiB | 520 | 514 |
| PPCSchedule440.td | 03-May-2024 | 35.5 KiB | 664 | 654 |
| PPCScheduleA2.td | 03-May-2024 | 49.5 KiB | 767 | 756 |
| PPCScheduleE500mc.td | 03-May-2024 | 14.7 KiB | 266 | 260 |
| PPCScheduleE5500.td | 03-May-2024 | 17.7 KiB | 310 | 301 |
| PPCScheduleG3.td | 03-May-2024 | 3.5 KiB | 72 | 69 |
| PPCScheduleG4.td | 03-May-2024 | 4.1 KiB | 82 | 80 |
| PPCScheduleG4Plus.td | 03-May-2024 | 4.6 KiB | 89 | 86 |
| PPCScheduleG5.td | 03-May-2024 | 5.5 KiB | 110 | 104 |
| PPCSelectionDAGInfo.cpp | 03-May-2024 | 737 B | 24 | 8 |
| PPCSelectionDAGInfo.h | 03-May-2024 | 830 B | 32 | 12 |
| PPCSubtarget.cpp | 03-May-2024 | 6.2 KiB | 188 | 119 |
| PPCSubtarget.h | 03-May-2024 | 6.9 KiB | 212 | 122 |
| PPCTargetMachine.cpp | 03-May-2024 | 5.4 KiB | 170 | 112 |
| PPCTargetMachine.h | 03-May-2024 | 3.3 KiB | 101 | 67 |
| PPCTargetObjectFile.cpp | 03-May-2024 | 2.6 KiB | 68 | 34 |
| PPCTargetObjectFile.h | 03-May-2024 | 1.2 KiB | 36 | 15 |
| PPCTargetTransformInfo.cpp | 03-May-2024 | 8.1 KiB | 241 | 140 |
| README.txt | 03-May-2024 | 22.7 KiB | 899 | 686 |
| README_ALTIVEC.txt | 03-May-2024 | 6.2 KiB | 212 | 158 |
README.txt
//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* lmw/stmw pass a la arm load store optimizer for prolog/epilog

===-------------------------------------------------------------------------===

On PPC64, this:

long f2 (long x) { return 0xfffffff000000000UL; }
long f3 (long x) { return 0x1ffffffffUL; }

could compile into:

_f2:
        li r3,-1
        rldicr r3,r3,0,27
        blr
_f3:
        li r3,-1
        rldicl r3,r3,0,31
        blr

we produce:

_f2:
        lis r2, 4095
        ori r2, r2, 65535
        sldi r3, r2, 36
        blr
_f3:
        li r2, 1
        sldi r2, r2, 32
        oris r2, r2, 65535
        ori r3, r2, 65535
        blr

===-------------------------------------------------------------------------===

This code:

unsigned add32carry(unsigned sum, unsigned x) {
  unsigned z = sum + x;
  if (sum + x < x)
    z++;
  return z;
}

Should compile to something like:

        addc r3,r3,r4
        addze r3,r3

instead we get:

        add r3, r4, r3
        cmplw cr7, r3, r4
        mfcr r4 ; 1
        rlwinm r4, r4, 29, 31, 31
        add r3, r3, r4

Ick.

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:        ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:        ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83        ;bb420.i

The CBE manages to produce:

        li r0, 143
        mtctr r0
loop:
        lbzx r2, r2, r11
        stbx r0, r2, r9
        addi r2, r2, 1
        bdz later
        b loop

This could be much better (bdnz instead of bdz) but it still beats us.  If we
produced this with bdnz, the loop would be a single dispatch group.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

Here's another example (the sgn function):
double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

it produces a BB like this:
LBB1_1: ; cond_true
        lis r2, ha16(LCPI1_0)
        lfs f0, lo16(LCPI1_0)(r2)
        lis r2, ha16(LCPI1_1)
        lis r3, ha16(LCPI1_2)
        lfs f2, lo16(LCPI1_2)(r3)
        lfs f3, lo16(LCPI1_1)(r2)
        fsub f0, f0, f1
        fsel f1, f0, f2, f3
        blr

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
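
For illustration, a minimal C sketch (hypothetical globals, not from the
original note) of the before/after shape of this transformation:

/* Before: each global needs its own GOT/PIC address computation. */
static int a, b, c;

/* After: one base address that can be computed once and CSE'd;
   each access becomes base + constant offset. */
static struct { int a, b, c; } g;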

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub removal:

We still generate calls to foo$stub, and stubs, on Darwin.  This is not
necessary when building with the Leopard (10.5) or later linker, as stubs are
generated by ld when necessary.  Parameterizing this based on the deployment
target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
its logic.

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10. That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5. The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.
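
To illustrate the potential win, here is a hypothetical signature (not from
the original note) where the shuffle would move an argument from the stack
into a register, assuming the 32-byte GPR window described above:

/* Before: the four doubles occupy argument bytes 4-35, pushing the second
   int past the 32-byte window and onto the stack. */
static int dist(int n, double a, double b, double c, double d, int m);

/* After: both ints travel in r3/r4; the doubles still arrive in f1-f4. */
static int dist2(int n, int m, double a, double b, double c, double d);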

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

So that

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

Becomes

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three
address form: ops that read/write the same register are half as expensive as
those that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).
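
The GCC sequence is the classic branchless abs idiom.  As a C sketch (assuming
arithmetic right shift of negative ints, which is implementation-defined but
what PPC compilers do):

int iabs(int x) {
  int mask = x >> 31;         /* 0 if x >= 0, all ones if x < 0 */
  return (x ^ mask) - mask;   /* x unchanged, or ~x + 1 == -x */
}

This maps directly onto the srawi/xor/subf triple above.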

===-------------------------------------------------------------------------===

PR5945: This:
define i32 @clamp0g(i32 %a) {
entry:
        %cmp = icmp slt i32 %a, 0
        %sel = select i1 %cmp, i32 0, i32 %a
        ret i32 %sel
}

Is compiled to this with the PowerPC (32-bit) backend:

_clamp0g:
        cmpwi cr0, r3, 0
        li r2, 0
        blt cr0, LBB1_2
; BB#1:                         ; %entry
        mr r2, r3
LBB1_2:                         ; %entry
        mr r3, r2
        blr

This could be reduced to the much simpler:

_clamp0g:
        srawi r2, r3, 31
        andc r3, r3, r2
        blr
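
The srawi/andc pair is a branchless max(a, 0).  The same identity as a C
sketch (again assuming arithmetic right shift of signed values):

int clamp0(int a) {
  /* (a >> 31) is all ones exactly when a < 0; clearing those bits gives 0. */
  return a & ~(a >> 31);
}

srawi produces the mask, and andc folds the complement into the and.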

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
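
A sketch of the strength reduction (valid for X > 0): carry t / X and t % X
across iterations instead of recomputing them each time:

int foo_sr(int N, int ***W, int **TK, int X) {
  int q = 0, r = 0;                  /* invariant: q == t / X, r == t % X */
  for (int t = 0; t < N; ++t) {
    for (int i = 0; i < 4; ++i)
      W[q][i][r] = TK[i][t];
    if (++r == X) { r = 0; ++q; }    /* increment with carry into quotient */
  }
  return 5;
}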

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw) code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw) code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw) code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
        %tmp = bitcast %struct.B* %b to i32*            ; <uint*> [#uses=1]
        %tmp = load i32* %tmp                           ; <uint> [#uses=1]
        %tmp3 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=1]
        %tmp4 = load i32* %tmp3                         ; <uint> [#uses=1]
        %tmp8 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=2]
        %tmp9 = load i32* %tmp8                         ; <uint> [#uses=1]
        %tmp4.mask17 = shl i32 %tmp4, i8 1              ; <uint> [#uses=1]
        %tmp1415 = and i32 %tmp4.mask17, 2147483648     ; <uint> [#uses=1]
        %tmp.masked = and i32 %tmp, 2147483648          ; <uint> [#uses=1]
        %tmp11 = or i32 %tmp1415, %tmp.masked           ; <uint> [#uses=1]
        %tmp12 = and i32 %tmp9, 2147483647              ; <uint> [#uses=1]
        %tmp13 = or i32 %tmp12, %tmp11                  ; <uint> [#uses=1]
        store i32 %tmp13, i32* %tmp8
        ret void
}

We emit:

_foo:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_foo:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr

===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need to get the LR register.
This ends up producing code like this:

_foo:
.LBB_foo_0:     ; entry
        mflr r11
***     stw r11, 8(r1)
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
***     lwz r11, 8(r1)
        mtlr r11
        blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.

Implementing this will require some codegen improvements.  Nate writes:

"So basically what we need to support the "no stack frame save and restore" is a
generalization of the LR optimization to "callee-save regs".

Currently, we have LR marked as a callee-save reg.  The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs. The class of LR would be
marked "unspillable".  When the register allocator came across an unspillable
reg, it would ask "what is the best class to copy this into that I *can* spill"
If it gets a class back, which it will in this case (the gprs), it grabs a free
register of that class.  If it is then later necessary to spill that reg, so be
it.

===-------------------------------------------------------------------------===

We compile this:
int test(_Bool X) {
  return X ? 524288 : 0;
}

to:
_test:
        cmplwi cr0, r3, 0
        lis r2, 8
        li r3, 0
        beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
        mr r3, r2
LBB1_2: ;entry
        blr

instead of:
_test:
        addic r2,r3,-1
        subfe r0,r2,r3
        slwi r3,r0,19
        blr

This sort of thing occurs a lot due to globalopt.

===-------------------------------------------------------------------------===

We compile:

define i32 @bar(i32 %x) nounwind readnone ssp {
entry:
  %0 = icmp eq i32 %x, 0                ; <i1> [#uses=1]
  %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
  ret i32 %neg
}

to:

_bar:
        cntlzw r2, r3
        slwi r2, r2, 26
        srawi r3, r2, 31
        blr

it would be better to produce:

_bar:
        addic r3,r3,-1
        subfe r3,r3,r3
        blr

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
        %B = call i32 @llvm.bswap.i32(i32 %A)
        ret i32 %B
}

to:

_test:
        rlwinm r2, r3, 24, 16, 23
        slwi r4, r3, 24
        rlwimi r2, r3, 8, 24, 31
        rlwimi r4, r3, 8, 8, 15
        rlwimi r4, r2, 0, 16, 31
        mr r3, r4
        blr

it would be more efficient to produce:

_foo:   mr r0,r3
        rlwinm r3,r3,8,0xffffffff
        rlwimi r3,r0,24,0,7
        rlwimi r3,r0,24,16,23
        blr

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
        ld r2, 0(r3)
        cntlzd r2, r2
        or r2, r2, r2     <<-- silly.
        addi r3, r2, -64
        blr

The dead or is a 'truncate' from 64- to 32-bits.

===-------------------------------------------------------------------------===

We generate horrible ppc code for this:

#define N  2000000
double   a[N],c[N];
void simpleloop() {
   int j;
   for (j=0; j<N; j++)
     c[j] = a[j];
}

LBB1_1: ;bb
        lfdx f0, r3, r4
        addi r5, r5, 1                 ;; Extra IV for the exit value compare.
        stfdx f0, r2, r4
        addi r4, r4, 8

        xoris r6, r5, 30               ;; This is due to a large immediate.
        cmplwi cr0, r6, 33920
        bne cr0, LBB1_1

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

__Z11no_overflowjj:
        add r4,r3,r4
        subfc r3,r3,r4
        li r3,0
        adde r3,r3,r3
        blr

(or better) not:

__Z11no_overflowjj:
        add r2, r4, r3
        cmplw cr7, r2, r3
        mfcr r2
        rlwinm r2, r2, 29, 31, 31
        xori r3, r2, 1
        blr
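
The desired sequence is just a carry extraction; the same predicate can be
written directly at the source level (a C sketch, not from the original note):

#include <stdbool.h>
bool no_overflow2(unsigned a, unsigned b) {
  /* unsigned a + b wraps exactly when the sum is smaller than a */
  return a + b >= a;
}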

//===---------------------------------------------------------------------===//

We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
example:
#include <math.h>
int test(double x, double y) { return islessequal(x, y);}
int test2(double x, double y) { return islessgreater(x, y);}
int test3(double x, double y) { return !islessequal(x, y);}

Compiles into (all three are similar, but the bits differ):

_test:
        fcmpu cr7, f1, f2
        mfcr r2
        rlwinm r3, r2, 29, 31, 31
        rlwinm r2, r2, 31, 31, 31
        or r3, r2, r3
        blr

GCC compiles this into:

_test:
        fcmpu cr7,f1,f2
        cror 30,28,30
        mfcr r3
        rlwinm r3,r3,31,1
        blr

which is more efficient and can use mfocr.  See PR642 for some more context.

//===---------------------------------------------------------------------===//

void foo(float *data, float d) {
   long i;
   for (i = 0; i < 8000; i++)
      data[i] = d;
}
void foo2(float *data, float d) {
   long i;
   data--;
   for (i = 0; i < 8000; i++) {
      data[1] = d;
      data++;
   }
}

These compile to:

_foo:
        li r2, 0
LBB1_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB1_1 ; bb
        blr
_foo2:
        li r2, 0
LBB2_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB2_1 ; bb
        blr

The 'mr' could be eliminated by folding the add into the cmp better.

//===---------------------------------------------------------------------===//
Codegen for the following (low-probability) case deteriorated considerably
when the correctness fixes for unordered comparisons went in (PR 642, 58871).
It should be possible to recover the code quality described in the comments.

; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
; This should produce one 'or' or 'cror' instruction per function.

; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
; PR2964

define i32 @test(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp ole double %x, %y          ; <i1> [#uses=1]
        %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
        ret i32 %tmp345
}

define i32 @test2(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp one double %x, %y          ; <i1> [#uses=1]
        %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
        ret i32 %tmp345
}

define i32 @test3(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp ugt double %x, %y          ; <i1> [#uses=1]
        %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
        ret i32 %tmp34
}

//===----------------------------------------------------------------------===//
; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg

; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
; should not be generated except with -enable-finite-only-fp-math or the like).
; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
; recognize a more elaborate tree than a simple SETxx.

define double @test_FNEG_sel(double %A, double %B, double %C) {
        %D = fsub double -0.000000e+00, %A              ; <double> [#uses=1]
        %Cond = fcmp ugt double %D, -0.000000e+00       ; <i1> [#uses=1]
        %E = select i1 %Cond, double %B, double %C      ; <double> [#uses=1]
        ret double %E
}

//===----------------------------------------------------------------------===//
The save/restore sequence for CR in prolog/epilog is terrible:
- Each CR subreg is saved individually, rather than doing one save as a unit.
- On Darwin, the save is done after the decrement of SP, which means the offset
from SP of the save slot can be too big for a store instruction, which means we
need an additional register (currently hacked in 96015+96020; the solution there
is correct, but poor).
- On SVR4 the same thing can happen, and I don't think saving before the SP
decrement is safe on that target, as there is no red zone.  This is currently
broken AFAIK, although it's not a target I can exercise.
The following demonstrates the problem:
extern void bar(char *p);
void foo() {
  char x[100000];
  bar(x);
  __asm__("" ::: "cr2");
}
README_ALTIVEC.txt
//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//

Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
registers, to generate better spill code.

//===----------------------------------------------------------------------===//

The first should be a single lvx from the constant pool, the second should be
a xor/stvx:

void foo(void) {
  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
  bar (x);
}

#include <string.h>
void foo(void) {
  int x[8] __attribute__((aligned(128)));
  memset (x, 0, sizeof (x));
  bar (x);
}

//===----------------------------------------------------------------------===//

Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763

When -ffast-math is on, we can use 0.0.
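
A sketch with the AltiVec C intrinsics of why the addend matters (vec_madd
computes a*b+c per element): under IEEE rules, -0.0 + x == x for every x,
while +0.0 + -0.0 == +0.0, so a +0.0 addend loses the sign of a -0.0 product:

#include <altivec.h>
vector float vmul(vector float a, vector float b) {
  /* The -0.0 addend preserves a -0.0 product; 0.0 would flip it to +0.0. */
  vector float nzero = (vector float){ -0.0f, -0.0f, -0.0f, -0.0f };
  return vec_madd(a, b, nzero);
}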
85 86//===----------------------------------------------------------------------===// 87 88For this function: 89 90void test(vector float *A, vector float *B) { 91 vector float C = (vector float)vec_cmpeq(*A, *B); 92 if (!vec_any_eq(*A, *B)) 93 *B = (vector float){0,0,0,0}; 94 *A = C; 95} 96 97we get the following basic block: 98 99 ... 100 lvx v2, 0, r4 101 lvx v3, 0, r3 102 vcmpeqfp v4, v3, v2 103 vcmpeqfp. v2, v3, v2 104 bne cr6, LBB1_2 ; cond_next 105 106The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the 107vcmpeqfp. result is used by a branch. This can be improved. 108 109//===----------------------------------------------------------------------===// 110 111The code generated for this is truly aweful: 112 113vector float test(float a, float b) { 114 return (vector float){ 0.0, a, 0.0, 0.0}; 115} 116 117LCPI1_0: ; float 118 .space 4 119 .text 120 .globl _test 121 .align 4 122_test: 123 mfspr r2, 256 124 oris r3, r2, 4096 125 mtspr 256, r3 126 lis r3, ha16(LCPI1_0) 127 addi r4, r1, -32 128 stfs f1, -16(r1) 129 addi r5, r1, -16 130 lfs f0, lo16(LCPI1_0)(r3) 131 stfs f0, -32(r1) 132 lvx v2, 0, r4 133 lvx v3, 0, r5 134 vmrghw v3, v3, v2 135 vspltw v2, v2, 0 136 vmrghw v2, v2, v3 137 mtspr 256, r2 138 blr 139 140//===----------------------------------------------------------------------===// 141 142int foo(vector float *x, vector float *y) { 143 if (vec_all_eq(*x,*y)) return 3245; 144 else return 12; 145} 146 147A predicate compare being used in a select_cc should have the same peephole 148applied to it as a predicate compare used by a br_cc. There should be no 149mfcr here: 150 151_foo: 152 mfspr r2, 256 153 oris r5, r2, 12288 154 mtspr 256, r5 155 li r5, 12 156 li r6, 3245 157 lvx v2, 0, r4 158 lvx v3, 0, r3 159 vcmpeqfp. v2, v3, v2 160 mfcr r3, 2 161 rlwinm r3, r3, 25, 31, 31 162 cmpwi cr0, r3, 0 163 bne cr0, LBB1_2 ; entry 164LBB1_1: ; entry 165 mr r6, r5 166LBB1_2: ; entry 167 mr r3, r6 168 mtspr 256, r2 169 blr 170 171//===----------------------------------------------------------------------===// 172 173CodeGen/PowerPC/vec_constants.ll has an and operation that should be 174codegen'd to andc. The issue is that the 'all ones' build vector is 175SelectNodeTo'd a VSPLTISB instruction node before the and/xor is selected 176which prevents the vnot pattern from matching. 177 178 179//===----------------------------------------------------------------------===// 180 181An alternative to the store/store/load approach for illegal insert element 182lowering would be: 183 1841. store element to any ol' slot 1852. lvx the slot 1863. lvsl 0; splat index; vcmpeq to generate a select mask 1874. lvsl slot + x; vperm to rotate result into correct slot 1885. vsel result together. 189 190//===----------------------------------------------------------------------===// 191 192Should codegen branches on vec_any/vec_all to avoid mfcr. Two examples: 193 194#include <altivec.h> 195 int f(vector float a, vector float b) 196 { 197 int aa = 0; 198 if (vec_all_ge(a, b)) 199 aa |= 0x1; 200 if (vec_any_ge(a,b)) 201 aa |= 0x2; 202 return aa; 203} 204 205vector float f(vector float a, vector float b) { 206 if (vec_any_eq(a, b)) 207 return a; 208 else 209 return b; 210} 211 212