Name | Date | Size | #Lines | LOC
---|---|---|---|---
AsmParser/ | 03-May-2024 | - | 3,812 | 3,020
Disassembler/ | 03-May-2024 | - | 4,042 | 2,893
InstPrinter/ | 03-May-2024 | - | 1,511 | 1,185
MCTargetDesc/ | 03-May-2024 | - | 5,452 | 3,637
TargetInfo/ | 03-May-2024 | - | 124 | 72
Utils/ | 03-May-2024 | - | 373 | 212
Android.mk | 03-May-2024 | 1.6 KiB | 73 | 54
CMakeLists.txt | 03-May-2024 | 1.8 KiB | 57 | 51
LLVMBuild.txt | 03-May-2024 | 1 KiB | 36 | 32
Makefile | 03-May-2024 | 840 B | 24 | 10
README-FPStack.txt | 03-May-2024 | 2.7 KiB | 86 | 58
README-MMX.txt | 03-May-2024 | 1.5 KiB | 72 | 55
README-SSE.txt | 03-May-2024 | 26.3 KiB | 929 | 713
README-UNIMPLEMENTED.txt | 03-May-2024 | 679 B | 15 | 12
README-X86-64.txt | 03-May-2024 | 6 KiB | 185 | 150
README.txt | 03-May-2024 | 51.6 KiB | 1,996 | 1,562
X86.h | 03-May-2024 | 3.1 KiB | 83 | 23
X86.td | 03-May-2024 | 21.8 KiB | 442 | 391
X86AsmPrinter.cpp | 03-May-2024 | 25.6 KiB | 727 | 562
X86AsmPrinter.h | 03-May-2024 | 1.7 KiB | 59 | 35
X86AtomicExpandPass.cpp | 03-May-2024 | 9.8 KiB | 288 | 183
X86CallingConv.h | 03-May-2024 | 1.1 KiB | 36 | 14
X86CallingConv.td | 03-May-2024 | 25.7 KiB | 663 | 519
X86CodeEmitter.cpp | 03-May-2024 | 53.1 KiB | 1,499 | 1,098
X86CompilationCallback_Win64.asm | 03-May-2024 | 1.6 KiB | 69 | 55
X86FastISel.cpp | 03-May-2024 | 115.6 KiB | 3,376 | 2,493
X86FixupLEAs.cpp | 03-May-2024 | 11.9 KiB | 343 | 259
X86FloatingPoint.cpp | 03-May-2024 | 65.9 KiB | 1,774 | 1,147
X86FrameLowering.cpp | 03-May-2024 | 62.7 KiB | 1,740 | 1,174
X86FrameLowering.h | 03-May-2024 | 2.8 KiB | 75 | 42
X86ISelDAGToDAG.cpp | 03-May-2024 | 102 KiB | 2,789 | 1,989
X86ISelLowering.cpp | 03-May-2024 | 878.9 KiB | 22,817 | 16,563
X86ISelLowering.h | 03-May-2024 | 40.7 KiB | 1,017 | 431
X86Instr3DNow.td | 03-May-2024 | 4.4 KiB | 104 | 90
X86InstrAVX512.td | 03-May-2024 | 227.4 KiB | 4,528 | 4,048
X86InstrArithmetic.td | 03-May-2024 | 66 KiB | 1,404 | 1,256
X86InstrBuilder.h | 03-May-2024 | 6.6 KiB | 185 | 107
X86InstrCMovSetCC.td | 03-May-2024 | 5.3 KiB | 113 | 103
X86InstrCompiler.td | 03-May-2024 | 79.1 KiB | 1,777 | 1,576
X86InstrControl.td | 03-May-2024 | 15.2 KiB | 316 | 286
X86InstrExtension.td | 03-May-2024 | 8.8 KiB | 173 | 157
X86InstrFMA.td | 03-May-2024 | 19.2 KiB | 391 | 362
X86InstrFPStack.td | 03-May-2024 | 34.3 KiB | 701 | 640
X86InstrFormats.td | 03-May-2024 | 37.9 KiB | 854 | 780
X86InstrFragmentsSIMD.td | 03-May-2024 | 26.4 KiB | 571 | 478
X86InstrInfo.cpp | 03-May-2024 | 226.3 KiB | 5,581 | 4,700
X86InstrInfo.h | 03-May-2024 | 21.1 KiB | 465 | 262
X86InstrInfo.td | 03-May-2024 | 131.3 KiB | 2,864 | 2,525
X86InstrMMX.td | 03-May-2024 | 28.6 KiB | 624 | 545
X86InstrSSE.td | 03-May-2024 | 430.6 KiB | 9,062 | 8,184
X86InstrSVM.td | 03-May-2024 | 2.1 KiB | 63 | 52
X86InstrShiftRotate.td | 03-May-2024 | 46.1 KiB | 970 | 899
X86InstrSystem.td | 03-May-2024 | 26.7 KiB | 570 | 490
X86InstrTSX.td | 03-May-2024 | 1.7 KiB | 48 | 37
X86InstrVMX.td | 03-May-2024 | 3.2 KiB | 67 | 63
X86InstrXOP.td | 03-May-2024 | 14.4 KiB | 290 | 266
X86JITInfo.cpp | 03-May-2024 | 19.3 KiB | 589 | 459
X86JITInfo.h | 03-May-2024 | 3 KiB | 80 | 29
X86MCInstLower.cpp | 03-May-2024 | 35.3 KiB | 927 | 717
X86MachineFunctionInfo.cpp | 03-May-2024 | 444 B | 15 | 3
X86MachineFunctionInfo.h | 03-May-2024 | 5.6 KiB | 146 | 76
X86PadShortFunction.cpp | 03-May-2024 | 6.8 KiB | 218 | 136
X86RegisterInfo.cpp | 03-May-2024 | 25 KiB | 718 | 587
X86RegisterInfo.h | 03-May-2024 | 4.9 KiB | 142 | 61
X86RegisterInfo.td | 03-May-2024 | 19.3 KiB | 476 | 412
X86Relocations.h | 03-May-2024 | 2 KiB | 53 | 15
X86SchedHaswell.td | 03-May-2024 | 8.6 KiB | 265 | 232
X86SchedSandyBridge.td | 03-May-2024 | 8 KiB | 250 | 217
X86Schedule.td | 03-May-2024 | 21.9 KiB | 642 | 578
X86ScheduleAtom.td | 03-May-2024 | 28.4 KiB | 544 | 493
X86ScheduleSLM.td | 03-May-2024 | 7.4 KiB | 232 | 200
X86SelectionDAGInfo.cpp | 03-May-2024 | 10.1 KiB | 268 | 203
X86SelectionDAGInfo.h | 03-May-2024 | 1.6 KiB | 49 | 27
X86Subtarget.cpp | 03-May-2024 | 12.2 KiB | 375 | 245
X86Subtarget.h | 03-May-2024 | 15.9 KiB | 473 | 253
X86TargetMachine.cpp | 03-May-2024 | 6 KiB | 187 | 116
X86TargetMachine.h | 03-May-2024 | 2.3 KiB | 71 | 45
X86TargetObjectFile.cpp | 03-May-2024 | 3.9 KiB | 109 | 69
X86TargetObjectFile.h | 03-May-2024 | 2 KiB | 54 | 27
X86TargetTransformInfo.cpp | 03-May-2024 | 38.9 KiB | 1,065 | 734
X86VZeroUpper.cpp | 03-May-2024 | 11.5 KiB | 317 | 188
README-FPStack.txt
//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: FP stack related stuff
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

Some targets (e.g. athlons) prefer freep to fstp ST(0):
http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html

//===---------------------------------------------------------------------===//

This should use fiadd on chips where it is profitable:
double foo(double P, int *I) { return P+*I; }

We have fiadd patterns now but the following have the same cost and
complexity. We need a way to specify that the latter is more profitable.

def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
                    [(set RFP:$dst, (fadd RFP:$src1,
                                     (extloadf64f32 addr:$src2)))]>;
                // ST(0) = ST(0) + [mem32]

def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
                    [(set RFP:$dst, (fadd RFP:$src1,
                                     (X86fild addr:$src2, i32)))]>;
                // ST(0) = ST(0) + [mem32int]

//===---------------------------------------------------------------------===//

The FP stackifier should handle simple permutations to reduce the number of
shuffle instructions, e.g. turning:

fld P       ->          fld Q
fld Q                   fld P
fxch

or:

fxch        ->          fucomi
fucomi                  jl X
jg X

Ideas:
http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html

//===---------------------------------------------------------------------===//

Add a target specific hook to the DAG combiner to handle SINT_TO_FP and
FP_TO_SINT when the source operand is already in memory.

//===---------------------------------------------------------------------===//

Open code rint, floor, ceil, trunc:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html

Open code the sincos[f] libcall.

//===---------------------------------------------------------------------===//

None of the FPStack instructions are handled in
X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
folding spill code into the instructions.

//===---------------------------------------------------------------------===//

Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:

unsigned int foo(double x) { return x; }

foo:
        subl $20, %esp
        movsd 24(%esp), %xmm0
        movsd %xmm0, 8(%esp)
        fldl 8(%esp)
        fisttpll (%esp)
        movl (%esp), %eax
        addl $20, %esp
        ret

This just requires being smarter when custom expanding fptoui.

//===---------------------------------------------------------------------===//
README-MMX.txt
//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: MMX-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

This:

#include <mmintrin.h>

__v2si qux(int A) {
  return (__v2si){ 0, A };
}

is compiled into:

_qux:
        subl $28, %esp
        movl 32(%esp), %eax
        movd %eax, %mm0
        movq %mm0, (%esp)
        movl (%esp), %eax
        movl %eax, 20(%esp)
        movq %mm0, 8(%esp)
        movl 12(%esp), %eax
        movl %eax, 16(%esp)
        movq 16(%esp), %mm0
        addl $28, %esp
        ret

Yuck!

GCC gives us:

_qux:
        subl $12, %esp
        movl 16(%esp), %eax
        movl 20(%esp), %edx
        movl $0, (%eax)
        movl %edx, 4(%eax)
        addl $12, %esp
        ret $4

//===---------------------------------------------------------------------===//

We generate crappy code for this:

__m64 t() {
  return _mm_cvtsi32_si64(1);
}

_t:
        subl $12, %esp
        movl $1, %eax
        movd %eax, %mm0
        movq %mm0, (%esp)
        movl (%esp), %eax
        movl 4(%esp), %edx
        addl $12, %esp
        ret

The extra stack traffic is covered in the previous entry. But the other reason
is that we are not smart about materializing constants in MMX registers. With
-m64 we get:

        movl $1, %eax
        movd %eax, %mm0
        movd %mm0, %rax
        ret

We should be using a constant-pool load instead:

        movq LC0(%rip), %rax
README-SSE.txt
//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
           _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa  %xmm0, %xmm2
        addss   %xmm1, %xmm2
        pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
        pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
        addss   %xmm1, %xmm3
        movaps  %xmm2, %xmm0
        unpcklps %xmm3, %xmm0           ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups are possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.
//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = fadd double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit:

_X:
        subl $12, %esp
        xorpd %xmm0, %xmm0
        addsd 24(%esp), %xmm0
        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
        jb LBB_X_2
LBB_X_1:
        movsd %xmm0, %xmm2
LBB_X_2:
        movsd %xmm2, (%esp)
        fldl (%esp)
        addl $12, %esp
        ret

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value. Also movlps on the
lower half of a v4sf value.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps    (%edx), %xmm2                         #59.21
        movaps    (%edx), %xmm5                         #60.21
        movaps    (%edx), %xmm4                         #61.21
        movaps    (%edx), %xmm3                         #62.21
        movl      40(%ecx), %ebp                        #69.49
        shufps    $0, %xmm2, %xmm5                      #60.21
        movl      100(%esp), %ebx                       #69.20
        movl      (%ebx), %edi                          #69.20
        imull     %ebp, %edi                            #69.49
        addl      (%eax), %edi                          #70.33
        shufps    $85, %xmm2, %xmm4                     #61.21
        shufps    $170, %xmm2, %xmm3                    #62.21
        shufps    $255, %xmm2, %xmm2                    #63.21
        lea       (%ebp,%ebp,2), %ebx                   #69.49
        negl      %ebx                                  #69.49
        lea       -3(%edi,%ebx), %ebx                   #70.33
        shll      $4, %ebx                              #68.37
        addl      32(%ecx), %ebx                        #68.37
        testb     $15, %bl                              #91.13
        jne       L_B1.24       # Prob 5%               #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok.
After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EAX = MOV32ri -3
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        %EDI = MOV32rr %EAX
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        %EBX = MOV32rr %EDI
        AND32ri8 %EBX<def&use>, 15
        CMP32ri8 %EBX, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two in more than one shufps, a number
of copies result. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator could
make the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

movss 4(%esp), %xmm1
mulss %xmm1, %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

movss 4(%esp), %xmm0
mulss %xmm0, %xmm0
ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

movaps c(%esp), %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

xorps %xmm0, %xmm0
movaps c2(%esp), %xmm1
movss %xmm1, %xmm0
ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

movss c2(%esp), %xmm0
ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros for the other elements instead of reloading
all 4 elements. This can be used to simplify a variety of shuffle operations,
where the other elements are known zeros.
//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl    8(%esp), %eax
        movaps  (%eax), %xmm0
        pxor    %xmm1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $50, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
        xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four floats into a single 128-bit
load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-math is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones
        psrlq   xmm1, 31        ; xmm1 = all 100000000000...

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
        movzwl  4(%esp), %eax
        movd    %eax, %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret
_y:
        movd    4(%esp), %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
like movd would be sufficient in both cases as the value is already zero
extended in the 32-bit stack slot IIRC. For signed short, it should also be
safe, as a really-signed value would be undefined for pslld.
//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1        ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                        # <4 x float>
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 elements simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.
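
To make the immediate encoding concrete, here is a minimal sketch (not part of
the original notes) of what one insertps can express; per the SSE4.1 reference,
bits 7:6 of the immediate select the source element, bits 5:4 the destination
slot, and bits 3:0 are a zero mask:

#include <smmintrin.h>

/* One insertps: copy b[2] into a[1] and zero out a[3].
   imm = (src_elt << 6) | (dst_elt << 4) | zero_mask */
__m128 demo(__m128 a, __m128 b) {
  return _mm_insert_ps(a, b, (2 << 6) | (1 << 4) | 0x8);
}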
//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not necessary. The
problem is that we need to know about stack alignment too early, before RA
runs.

At that point we don't know whether there will be vector spills or not.
The stack realignment logic is overly conservative here, but otherwise we
could produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725,
- 12873, - 22725, - 12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss   4(%esp), %xmm0
        pshufd  $81, %xmm0, %xmm0
        ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps   %xmm1, %xmm1
        movss   %xmm0, %xmm1
        pshufd  $81, %xmm1, %xmm0
        ret

In sse4 mode, we could use insertps to make both better.
Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps  $0x10, x2(%rip), %xmm0
        insertps  $0x10, x3(%rip), %xmm1
        movaps    %xmm1, %xmm2
        movlhps   %xmm0, %xmm2
        movaps    %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                        ## <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq       %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq       %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:

        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret

gcc-4.2:

        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align 4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be:

        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. insertelement of a constant into a vector of constants should
also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
We compiled it to something horrible:

        .align  4
LCPI1_1:                                        ## float
        .long   1065353216      ## float 1
        .const

        .align  4
LCPI1_0:                                        ## <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext %x) nounwind {
        %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
        ret float %tmp12
}

compiles to:

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss        %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd        16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in scimark where using fpstack is better: it is
cheaper to do fld1 than load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.

//===---------------------------------------------------------------------===//

The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
"cmpsd". For example, this code:

double d1(double x) { return x == x ? x : x + x; }

compiles into:

_d1:
        ucomisd %xmm0, %xmm0
        jnp     LBB1_2
        addsd   %xmm0, %xmm0
        ret
LBB1_2:
        ret

Also, the 'ret's should be shared. This is PR6032.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?
define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.

//===---------------------------------------------------------------------===//

On SSE4 machines, we compile this code:

define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
                          <2 x float> *%P) nounwind {
  %Z = fadd <2 x float> %Q, %R

  store <2 x float> %Z, <2 x float> *%P
  ret <2 x float> %Z
}

into:

_test2:                                 ## @test2
## BB#0:
        insertps        $0, %xmm2, %xmm2
        insertps        $16, %xmm3, %xmm2
        insertps        $0, %xmm0, %xmm3
        insertps        $16, %xmm1, %xmm3
        addps   %xmm2, %xmm3
        movq    %xmm3, (%rdi)
        movaps  %xmm3, %xmm0
        pshufd  $1, %xmm3, %xmm1
        ## kill: XMM1<def> XMM1<kill>
        ret

The insertps's of $0 are pointless complex copies.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is
slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
and emit 3 mulsd in place of the divs. This can be done as a target-independent
transform.

If we're dealing with floats instead of doubles we could even replace the sqrtss
and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
cost of reduced accuracy.

//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.
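
As a concrete sketch of the subtract cases just mentioned (not in the original
notes; it reuses the same GCC/Clang vector-subscript style as f above):

#include <x86intrin.h>

/* Each of these could lower to a single hsubpd. */
double g(__m128d v) {
  return v[0] - v[1];
}
__m128d h(__m128d v, __m128d w) {
  return (__m128d){ v[0] - v[1], w[0] - w[1] };
}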
//===---------------------------------------------------------------------===//

define <2 x i32> @foo(<2 x double> %in) {
  %x = fptosi <2 x double> %in to <2 x i32>
  ret <2 x i32> %x
}

Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.

//===---------------------------------------------------------------------===//
README-UNIMPLEMENTED.txt
//===---------------------------------------------------------------------===//
// Testcases that crash the X86 backend because they aren't implemented
//===---------------------------------------------------------------------===//

These are cases we know the X86 backend doesn't handle. Patches are welcome
and appreciated, because no one has signed up to implement these yet.
Implementing these would allow elimination of the corresponding intrinsics,
which would be great.

1) vector shifts
2) vector comparisons
3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
4) bitcasts from vectors to scalars: PR2804
5) llvm.atomic.cmp.swap.i128.p0i128: PR3462
README-X86-64.txt
//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//

AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
multiplication by a constant. How much of it applies to Intel's X86-64
implementation? There are definite trade-offs to consider: latency vs. register
pressure vs. code size.

//===---------------------------------------------------------------------===//

Are we better off using branches instead of cmove to implement FP to
unsigned i64?

_conv:
        ucomiss LC0(%rip), %xmm0
        cvttss2siq      %xmm0, %rdx
        jb      L3
        subss   LC0(%rip), %xmm0
        movabsq $-9223372036854775808, %rax
        cvttss2siq      %xmm0, %rdx
        xorq    %rax, %rdx
L3:
        movq    %rdx, %rax
        ret

instead of

_conv:
        movss   LCPI1_0(%rip), %xmm1
        cvttss2siq      %xmm0, %rcx
        movaps  %xmm0, %xmm2
        subss   %xmm1, %xmm2
        cvttss2siq      %xmm2, %rax
        movabsq $-9223372036854775808, %rdx
        xorq    %rdx, %rax
        ucomiss %xmm1, %xmm0
        cmovb   %rcx, %rax
        ret

Seems like the jb branch has a high likelihood of being taken. It would have
saved a few instructions.

//===---------------------------------------------------------------------===//

It's not possible to reference the AH, BH, CH, and DH registers in an
instruction requiring a REX prefix. However, divb and mulb both produce results
in AH. If isel emits a CopyFromReg which gets turned into a movb, that movb can
be allocated to any of r8b - r15b, which require REX.

To get around this, isel emits a CopyFromReg from AX and then right shifts it
down by 8 and truncates it. It's not pretty but it works. We need some register
allocation magic to make the hack go away (e.g. putting additional constraints
on the result of the movb).

//===---------------------------------------------------------------------===//

The x86-64 ABI for hidden-argument struct returns requires that the
incoming value of %rdi be copied into %rax by the callee upon return.

The idea is that it saves callers from having to remember this value,
which would often require a callee-saved register. Callees usually
need to keep this value live for most of their body anyway, so it
doesn't add a significant burden on them.

We currently implement this in codegen; however, this is suboptimal
because it means that it would be quite awkward to implement the
optimization for callers.

A better implementation would be to relax the LLVM IR rules for sret
arguments to allow a function with an sret argument to have a non-void
return type, and to have the front-end set up the sret argument value
as the return value of the function. The front-end could more easily
emit uses of the returned struct value to be in terms of the function's
lowered return value, and it would free non-C frontends from a
complication only required by a C-based ABI.
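
A minimal C illustration (hypothetical example, not from the original notes):
any function returning a memory-class struct is subject to this rule.

/* Returned in memory under the SysV x86-64 ABI: the caller passes a hidden
   pointer in %rdi, and the callee must hand that same pointer back in %rax. */
typedef struct { double x, y, z; } Vec3;

Vec3 make_vec(double a) {
  Vec3 v = { a, a, a };
  return v;
}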
//===---------------------------------------------------------------------===//

We get a redundant zero extension for code like this:

int mask[1000];
int foo(unsigned x) {
  if (x < 10)
    x = x * 45;
  else
    x = x * 78;
  return mask[x];
}

_foo:
LBB1_0: ## entry
        cmpl    $9, %edi
        jbe     LBB1_3  ## bb
LBB1_1: ## bb1
        imull   $78, %edi, %eax
LBB1_2: ## bb2
        movl    %eax, %eax              <----
        movq    _mask@GOTPCREL(%rip), %rcx
        movl    (%rcx,%rax,4), %eax
        ret
LBB1_3: ## bb
        imull   $45, %edi, %eax
        jmp     LBB1_2  ## bb2

Before regalloc, we have:

        %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
        JMP mbb<bb2,0x203afb0>
    Successors according to CFG: 0x203afb0 (#3)

bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
    Predecessors according to CFG: 0x203aec0 (#0)
        %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
    Successors according to CFG: 0x203afb0 (#3)

bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
    Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
        %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
                            %reg1026, mbb<bb1,0x203af60>
        %reg1029<def> = MOVZX64rr32 %reg1027

so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
be able to recognize the zero extend. This could also presumably be implemented
if we have whole-function selectiondags.

//===---------------------------------------------------------------------===//

Take the following code
(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
extern unsigned long table[];
unsigned long foo(unsigned char *p) {
  unsigned long tag = *p;
  return table[tag >> 4] + table[tag & 0xf];
}

Current code generated:
        movzbl  (%rdi), %eax
        movq    %rax, %rcx
        andq    $240, %rcx
        shrq    %rcx
        andq    $15, %rax
        movq    table(,%rax,8), %rax
        addq    table(%rcx), %rax
        ret

Issues:
1. First movq should be movl; saves a byte.
2. Both andq's should be andl; saves another two bytes. I think this was
   implemented at one point, but subsequently regressed.
3. shrq should be shrl; saves another byte.
4. The first andq can be completely eliminated by using a slightly more
   expensive addressing mode.

//===---------------------------------------------------------------------===//

Consider the following (contrived testcase, but contains common factors):

#include <stdarg.h>
int test(int x, ...) {
  int sum, i;
  va_list l;
  va_start(l, x);
  for (i = 0; i < x; i++)
    sum += va_arg(l, int);
  va_end(l);
  return sum;
}

Testcase given in C because fixing it will likely involve changing the IR
generated for it. The primary issue with the result is that it doesn't do any
of the optimizations which are possible if we know the address of a va_list
in the current function is never taken:
1. We shouldn't spill the XMM registers because we only call va_arg with "int".
2. It would be nice if we could scalarrepl the va_list.
3. Probably overkill, but it'd be cool if we could peel off the first five
   iterations of the loop.

Other optimizations involving functions which use va_arg on floats which don't
have the address of a va_list taken:
1. Conversely to the above, we shouldn't spill general registers if we only
   call va_arg on "double".
2. If we know nothing more than 64 bits wide is read from the XMM registers,
   we can change the spilling code to reduce the amount of stack used by half.
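
A sketch of the "double" case from the second list (hypothetical, not part of
the original notes): only the XMM registers carry the variadic arguments here,
and va_arg reads just the low 64 bits of each.

#include <stdarg.h>

double sumd(int n, ...) {
  va_list l;
  va_start(l, n);
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += va_arg(l, double);
  va_end(l);
  return s;
}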
//===---------------------------------------------------------------------===//
README.txt
//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer. What about overflow
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224

//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl    %eax, %eax
        xorl    %edx, %edx
        testb   $32, %cl
        sete    %al
        setne   %dl
        sall    %cl, %eax
        sall    %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better. It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
        movl    %ecx, %eax
        shrl    $5, %eax
        movl    %eax, %edx
        xorl    $1, %edx
        sall    %cl, %eax
        sall    %cl, %edx

64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure. E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it, though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing. Need to investigate.

//===---------------------------------------------------------------------===//

This:

void foo(void);
void bar(int x, int *P) {
  x >>= 2;
  if (x)
    foo();
  *P = x;
}

compiles into:

        movq    %rsi, %rbx
        movl    %edi, %r14d
        sarl    $2, %r14d
        testl   %r14d, %r14d
        je      LBB0_2

Instead of doing an explicit test, we can use the flags off the sar.
This occurs in a bigger testcase like this, which is pretty common:

#include <vector>
int test1(std::vector<int> &X) {
  int Sum = 0;
  for (long i = 0, e = X.size(); i != e; ++i)
    X[i] = 0;
  return Sum;
}

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some processor
flags.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax
        ret

on some processors (which ones?), it is more efficient to do this:

_test:
        movl 8(%esp), %ebx
        xor  %eax, %eax
        cmpl %ebx, 4(%esp)
        setl %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important. e.g., for:

void setbit(int *target, int bit) {
    *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
    *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

It might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register. It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is.
For X/8, we currently get this:

define i32 @test1(i32 %X) {
    %Y = sdiv i32 %X, 8
    ret i32 %Y
}

_test1:
        movl 4(%esp), %eax
        movl %eax, %ecx
        sarl $31, %ecx
        shrl $29, %ecx
        addl %ecx, %eax
        sarl $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl 4(%esp), %eax
        cmpl $-1, %eax
        leal 7(%eax), %ecx
        cmovle %ecx, %eax
        sarl $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl. We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
  x * copysign(1.0, y) * copysign(1.0, z)

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        movl 4(%esp), %eax
        movaps (%eax), %xmm1
        comiss %xmm0, %xmm1
        setae %al
        movzbl %al, %ecx
        movl $3, %eax
        movl $4, %edx
        cmpl $0, %ecx
        cmove %edx, %eax
        ret

Note that the setae, movzbl, cmpl, and cmove can be replaced with a single
cmovae. There are a number of issues: 1) we are introducing a setcc between the
result of the intrinsic call and the select; 2) the intrinsic is expected to
produce an i32 value, so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
        imull $3, 4(%esp), %eax

Perhaps we should generate the latter below instead? Is imull three or four
cycles? Note: ICC generates this:
        movl 4(%esp), %eax
        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load, so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.
On a Pentium M, both variants have the same characteristics with regard
to throughput; however, the multiplication has a latency of four cycles, as
opposed to two cycles for the movl+lea variant.

//===---------------------------------------------------------------------===//

__builtin_ffs codegen is messy.

int ffs_(unsigned X) { return __builtin_ffs(X); }

llvm produces:

ffs_:
        movl    4(%esp), %ecx
        bsfl    %ecx, %eax
        movl    $32, %edx
        cmove   %edx, %eax
        incl    %eax
        xorl    %edx, %edx
        testl   %ecx, %ecx
        cmove   %edx, %eax
        ret

vs gcc:

_ffs_:
        movl    $-1, %edx
        bsfl    4(%esp), %eax
        cmove   %edx, %eax
        addl    $1, %eax
        ret

Another example of __builtin_ffs (use predsimplify to eliminate a select):

int foo (unsigned long j) {
  if (j)
    return __builtin_ffs (j) - 1;
  else
    return 0;
}

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

define i32 @foo(i32* %a, i32 %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]          ; <i32> [#uses=3]
        %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]    ; <i32> [#uses=1]
        %tmp2 = getelementptr i32* %a, i32 %x.0.0       ; <i32*> [#uses=1]
        %tmp3 = load i32* %tmp2                         ; <i32> [#uses=1]
        %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
        %tmp7 = add i32 %tmp5, %tmp3                    ; <i32> [#uses=2]
        %tmp9 = add i32 %x.0.0, 1                       ; <i32> [#uses=2]
        %tmp = icmp sgt i32 %tmp9, 39                   ; <i1> [#uses=1]
        br i1 %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret i32 %tmp7
}

is pessimized by -loop-reduce and -indvars.

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align stack slots of arguments of type double on an
8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".

//===---------------------------------------------------------------------===//

Consider the expansion of:

define i32 @test3(i32 %X) {
        %tmp1 = urem i32 %X, 255
        ret i32 %tmp1
}

Currently it compiles to:

...
        movl $2155905153, %ecx
        movl 8(%esp), %esi
        movl %esi, %eax
        mull %ecx
...

This could be "reassociated" into:

        movl $2155905153, %eax
        movl 8(%esp), %ecx
        mull %ecx

to avoid the copy. In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction. I guess this has
to be done at isel time based on the #uses of the mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example from 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary. In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
937  937 0x3d0a incl     %esi
3    3   0x3d0b cmpb     %bl, %dl
27   27  0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

int f(char *p) {
  if ((p[0] == 1) & (p[1] == 2)) return 1;
  return 0;
}

//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//

Use the FLAGS values from arithmetic instructions more. For example, compile:

int add_zf(int *x, int y, int a, int b) {
  if ((*x += y) == 0)
    return a;
  else
    return b;
}

to:
        addl %esi, (%rdi)
        movl %edx, %eax
        cmovne %ecx, %eax
        ret

instead of:

_add_zf:
        addl (%rdi), %esi
        movl %esi, (%rdi)
        testl %esi, %esi
        cmove %edx, %ecx
        movl %ecx, %eax
        ret

As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
without a test instruction.

//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) { ++i; if (i == n) ++i; return i; }
unsigned int f2(unsigned int i, unsigned int n) { ++i; i += i == n; return i; }

We currently compile them to:

_f:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        movl 8(%esp), %edx
        cmpl %edx, %ecx
        jne LBB1_2      #UnifiedReturnBlock
LBB1_1: #cond_true
        addl $2, %eax
        ret
LBB1_2: #UnifiedReturnBlock
        movl %ecx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        cmpl 8(%esp), %ecx
        sete %cl
        movzbl %cl, %ecx
        leal 1(%ecx,%eax), %eax
        ret

both of which are inferior to GCC's:

_f:
        movl 4(%esp), %edx
        leal 1(%edx), %eax
        addl $2, %edx
        cmpl 8(%esp), %eax
        cmove %edx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        addl $1, %eax
        xorl %edx, %edx
        cmpl 8(%esp), %eax
        sete %dl
        addl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne LBB1_1
        addl $12, %esp
        ret
LBB1_1:
        call L_abort$stub

It would be better to produce:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne L_abort$stub
        addl $12, %esp
        ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        subl $12, %esp
        call L_abort$stub

Both are useful in different situations. Finally, it could be shrink-wrapped
and tail called, like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        pop %eax   # realign stack.
        call L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction. Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
        movl $7, %eax
        movsbl 4(%esp), %ecx
        subl %ecx, %eax
        ret

We would use one fewer register if codegen'd as:

        movsbl 4(%esp), %eax
        neg %eax
        add $7, %eax
        ret

Note that this isn't beneficial if the load can be folded into the sub. In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
        movl $7, %eax
        subl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
        pushl %esi
        subl $4, %esp
...
and an epilog like this:
        addl $4, %esp
        popl %esi
        ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp. Just don't pop
into any return registers :)

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches. We generate really poor code for:

double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
        subl $20, %esp
        pxor %xmm0, %xmm0
        movsd 24(%esp), %xmm1
        ucomisd %xmm0, %xmm1
        setnp %al
        sete %cl
        testb %cl, %al
        jne LBB1_5      # UnifiedReturnBlock
LBB1_1: # cond_true

it would be better to replace the last four instructions with:

        jp LBB1_1
        je LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

        cvtss2sd LCPI1_0(%rip), %xmm2
        cvtss2sd LCPI1_1(%rip), %xmm3
        ucomisd %xmm1, %xmm0
        ja LBB1_3       # cond_true
LBB1_2: # cond_true
        movapd %xmm3, %xmm2
LBB1_3: # cond_true
        movapd %xmm2, %xmm0
        ret

We should sink the load into xmm3 down into the LBB1_2 block. This should
be pretty easy, and would nuke all the copies.

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

should compile to:

        addl %esi, %edi
        setae %al
        movzbl %al, %eax
        ret

on x86-64, instead of the rather stupid-looking:

        addl %esi, %edi
        setb %al
        xorb $1, %al
        movzbl %al, %eax
        ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
        %tmp231232 = sext i16 %tmp62 to i32     ; <i32> [#uses=1]
        %tmp233 = sub i32 32, %tmp231232        ; <i32> [#uses=1]
        %tmp245246 = sext i16 %tmp65 to i32     ; <i32> [#uses=1]
        %tmp252253 = sext i16 %tmp68 to i32     ; <i32> [#uses=1]
        %tmp254 = sub i32 32, %tmp252253        ; <i32> [#uses=1]
        %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
        %tmp583584 = sext i16 %tmp98 to i32     ; <i32> [#uses=1]
        %tmp585 = sub i32 32, %tmp583584        ; <i32> [#uses=1]
        %tmp614615 = sext i16 %tmp101 to i32    ; <i32> [#uses=1]
        %tmp621622 = sext i16 %tmp104 to i32    ; <i32> [#uses=1]
        %tmp623 = sub i32 32, %tmp621622        ; <i32> [#uses=1]
        br label %bb114

produces:

LBB3_5: # bb114.preheader
        movswl -68(%ebp), %eax
        movl $32, %ecx
        movl %ecx, -80(%ebp)
        subl %eax, -80(%ebp)
        movswl -52(%ebp), %eax
        movl %ecx, -84(%ebp)
        subl %eax, -84(%ebp)
        movswl -70(%ebp), %eax
        movl %ecx, -88(%ebp)
        subl %eax, -88(%ebp)
        movswl -50(%ebp), %eax
        subl %eax, %ecx
        movl %ecx, -76(%ebp)
        movswl -42(%ebp), %eax
        movl %eax, -92(%ebp)
        movswl -66(%ebp), %eax
        movl %eax, -96(%ebp)
        movw $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl. The above instructions could be:
        movl $32, -80(%ebp)
...
        movl $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This also has redundant subtractions of %eax from a stack slot. However, %ecx
doesn't change, so we could simply subtract %eax from %ecx first and then use
%ecx (or vice versa).

//===---------------------------------------------------------------------===//

This code:

        %tmp659 = icmp slt i16 %tmp654, 0       ; <i1> [#uses=1]
        br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

        testw %cx, %cx
        movswl %cx, %esi
        jns LBB4_109    # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal. It
suggests using the 32-bit register (which is what ICC uses).
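
Presumably that means reusing the sign-extended value for the test, something
like this (our sketch of the suggestion, not ICC's actual output):

        movswl %cx, %esi
        testl %esi, %esi
        jns LBB4_109    # cond_next715

The sign extension preserves the sign bit, so testing the 32-bit register is
equivalent.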

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
  if (foo < 4294967297LL)
    abort();
}

to:

compare:
        subl $4, %esp
        cmpl $0, 8(%esp)
        setne %al
        movzbw %al, %ax
        cmpl $1, 12(%esp)
        setg %cl
        movzbw %cl, %cx
        cmove %ax, %cx
        testb $1, %cl
        jne .LBB1_2     # UnifiedReturnBlock
.LBB1_1:        # ifthen
        call abort
.LBB1_2:        # UnifiedReturnBlock
        addl $4, %esp
        ret

(also really horrible code on ppc). This is due to the expand code for 64-bit
compares. GCC produces multiple branches, which is much nicer:

compare:
        subl $12, %esp
        movl 20(%esp), %edx
        movl 16(%esp), %eax
        decl %edx
        jle .L7
.L5:
        addl $12, %esp
        ret
        .p2align 4,,7
.L7:
        jl .L4
        cmpl $0, %eax
        .p2align 4,,8
        ja .L5
.L4:
        .p2align 4,,9
        call abort

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail-call-optimized calls) that source from the caller's arguments
or from a virtual register (which may itself source from the caller's
arguments). This is done to prevent overwriting of parameters (see the
example below) that might be used later.

example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
  int64 local = arg2 * 2;
  return callee(arg2, (int64)local);
}

[arg1]          [!arg2 no longer valid since we moved local onto it]
[arg2]      ->  [(int64)
[RETADDR]        local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee, and only
   push those onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg1, arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg2, arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.

//===---------------------------------------------------------------------===//

main ()
{
  int i = 0;
  unsigned long int z = 0;

  do {
    z -= 0x00004000;
    i++;
    if (i > 0x00040000)
      abort ();
  } while (z > 0);
  exit (0);
}

gcc compiles this to:

_main:
        subl $28, %esp
        xorl %eax, %eax
        jmp L2
L3:
        cmpl $262144, %eax
        je L10
L2:
        addl $1, %eax
        cmpl $262145, %eax
        jne L3
        call L_abort$stub
L10:
        movl $0, (%esp)
        call L_exit$stub

llvm:

_main:
        subl $12, %esp
        movl $1, %eax
        movl $16384, %ecx
LBB1_1: # bb
        cmpl $262145, %eax
        jge LBB1_4      # cond_true
LBB1_2: # cond_next
        incl %eax
        addl $4294950912, %ecx
        cmpl $16384, %ecx
        jne LBB1_1      # bb
LBB1_3: # bb11
        xorl %eax, %eax
        addl $12, %esp
        ret
LBB1_4: # cond_true
        call L_abort$stub

1. LSR should rewrite the first cmp with the induction variable %ecx.
2. The DAG combiner should fold
        leal 1(%eax), %edx
        cmpl $262145, %edx
   =>
        cmpl $262144, %eax

//===---------------------------------------------------------------------===//

define i64 @test(double %X) {
        %Y = fptosi double %X to i64
        ret i64 %Y
}

compiles to:

_test:
        subl $20, %esp
        movsd 24(%esp), %xmm0
        movsd %xmm0, 8(%esp)
        fldl 8(%esp)
        fisttpll (%esp)
        movl 4(%esp), %edx
        movl (%esp), %eax
        addl $20, %esp
        #FP_REG_KILL
        ret

This should just fldl directly from the input stack slot.

//===---------------------------------------------------------------------===//

This code:
int foo (int x) { return (x & 65535) | 255; }

should compile into:

_foo:
        movzwl 4(%esp), %eax
        orl $255, %eax
        ret

instead of:
_foo:
        movl $65280, %eax
        andl 4(%esp), %eax
        orl $255, %eax
        ret

//===---------------------------------------------------------------------===//

We're codegen'ing multiply of long longs inefficiently:

unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
  return arg1 * arg2;
}

We compile to (-fomit-frame-pointer):

_LLM:
        pushl %esi
        movl 8(%esp), %ecx
        movl 16(%esp), %esi
        movl %esi, %eax
        mull %ecx
        imull 12(%esp), %esi
        addl %edx, %esi
        imull 20(%esp), %ecx
        movl %esi, %edx
        addl %ecx, %edx
        popl %esi
        ret

This looks like a scheduling deficiency and lack of remat of the load from
the argument area. ICC apparently produces:

        movl 8(%esp), %ecx
        imull 12(%esp), %ecx
        movl 16(%esp), %eax
        imull 4(%esp), %eax
        addl %eax, %ecx
        movl 4(%esp), %eax
        mull 12(%esp)
        addl %ecx, %edx
        ret

Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236

//===---------------------------------------------------------------------===//

We can fold a store into "zeroing a reg". Instead of:

xorl %eax, %eax
movl %eax, 124(%esp)

we should get:

movl $0, 124(%esp)

if the flags of the xor are dead.

Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
be folded into: shl [mem], 1

//===---------------------------------------------------------------------===//

In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:

        xorpd LCPI1_0, %xmm2

However, if xmm2 gets spilled, we end up with really ugly code like this:

        movsd (%esp), %xmm0
        xorpd LCPI1_0, %xmm0
        movsd %xmm0, (%esp)

Since we 'know' that this is a 'neg', we can actually "fold" the spill into
the neg/abs instruction, turning it into an *integer* operation, like this:

        xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)

you could also use xorb, but xorl is less likely to lead to a partial register
stall. Here is a contrived testcase:

double a, b, c;
void test(double *P) {
  double X = *P;
  a = X;
  bar();
  X = -X;
  b = X;
  bar();
  c = X;
}

//===---------------------------------------------------------------------===//

The code generated on x86 for checking for signed overflow on a multiply, done
the obvious way, is much longer than it needs to be.

int x(int a, int b) {
  long long prod = (long long)a*b;
  return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
}

See PR2053 for more details.

//===---------------------------------------------------------------------===//

We should investigate using cdq/cltd (effect: edx = sar eax, 31) more
aggressively; it should cost the same as a move+shift on any modern processor,
but it's a lot shorter. The downside is that it puts more pressure on register
allocation because it has fixed operands.

Example:
int abs(int x) { return x < 0 ? -x : x; }

gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
        movl 4(%esp), %eax
        cltd
        xorl %edx, %eax
        subl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  return 0;
}

The following code is currently generated:
FirstOnet:
        movl 8(%esp), %eax
        cmpl $65536, %eax
        movl 4(%esp), %ecx
        jb .LBB1_2      # UnifiedReturnBlock
.LBB1_1:        # ifthen
        shrl $16, %eax
        movzbl first_one(%eax), %eax
        ret
.LBB1_2:        # UnifiedReturnBlock
        xorl %eax, %eax
        ret

We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
lets us change the cmpl into a testl, which is shorter, and eliminate the shift.

//===---------------------------------------------------------------------===//

We compile this function:

define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
entry:
        %tmp2 = icmp eq i8 %d, 0        ; <i1> [#uses=1]
        br i1 %tmp2, label %bb7, label %bb

bb:             ; preds = %entry
        %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
        ret i32 %tmp6

bb7:            ; preds = %entry
        %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
        ret i32 %tmp10
}

to:

foo:                                    # @foo
# BB#0:                                 # %entry
        movl 4(%esp), %ecx
        cmpb $0, 16(%esp)
        je .LBB0_2
# BB#1:                                 # %bb
        movl 8(%esp), %eax
        addl %ecx, %eax
        ret
.LBB0_2:                                # %bb7
        movl 12(%esp), %edx
        movl %ecx, %eax
        subl %edx, %eax
        ret

There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
couple more movls by putting 4(%esp) into %eax instead of %ecx.

//===---------------------------------------------------------------------===//

See rdar://4653682.

From flops:

LBB1_15:        # bb310
        cvtss2sd LCPI1_0, %xmm1
        addsd %xmm1, %xmm0
        movsd 176(%esp), %xmm2
        mulsd %xmm0, %xmm2
        movapd %xmm2, %xmm3
        mulsd %xmm3, %xmm3
        movapd %xmm3, %xmm4
        mulsd LCPI1_23, %xmm4
        addsd LCPI1_24, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_25, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_26, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_27, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_28, %xmm4
        mulsd %xmm3, %xmm4
        addsd %xmm1, %xmm4
        mulsd %xmm2, %xmm4
        movsd 152(%esp), %xmm1
        addsd %xmm4, %xmm1
        movsd %xmm1, 152(%esp)
        incl %eax
        cmpl %eax, %esi
        jge LBB1_15     # bb310
LBB1_16:        # bb358.loopexit
        movsd 152(%esp), %xmm0
        addsd %xmm0, %xmm0
        addsd LCPI1_22, %xmm0
        movsd %xmm0, 152(%esp)

Rather than spilling the result of the last addsd in the loop, we should have
inserted a copy to split the interval (one for the duration of the loop, one
extending to the fall-through). The register pressure in the loop isn't high
enough to warrant the spill.

Also check why xmm7 is not used at all in the function.

//===---------------------------------------------------------------------===//

Take the following:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]

define fastcc void @abort_gzip() noreturn nounwind {
entry:
        %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
        br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:           ; preds = %entry
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
bb4.i:          ; preds = %entry
        store i1 true, i1* @in_exit.4870.b
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
}
declare void @exit(i32) noreturn nounwind

This compiles into:

_abort_gzip:                            ## @abort_gzip
## BB#0:                                ## %entry
        subl $12, %esp
        movb _in_exit.4870.b, %al
        cmpb $1, %al
        jne LBB0_2

We somehow miss folding the movb into the cmpb.

//===---------------------------------------------------------------------===//

We compile:

int test(int x, int y) {
  return x-y-1;
}

into (-m64):

_test:
        decl %edi
        movl %edi, %eax
        subl %esi, %eax
        ret

it would be better to codegen as: x+~y (notl+addl)

//===---------------------------------------------------------------------===//

This code:

int foo(const char *str,...)
{
  __builtin_va_list a; int x;
  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
  return x;
}

gets compiled into this on x86-64:
        subq    $200, %rsp
        movaps  %xmm7, 160(%rsp)
        movaps  %xmm6, 144(%rsp)
        movaps  %xmm5, 128(%rsp)
        movaps  %xmm4, 112(%rsp)
        movaps  %xmm3, 96(%rsp)
        movaps  %xmm2, 80(%rsp)
        movaps  %xmm1, 64(%rsp)
        movaps  %xmm0, 48(%rsp)
        movq    %r9, 40(%rsp)
        movq    %r8, 32(%rsp)
        movq    %rcx, 24(%rsp)
        movq    %rdx, 16(%rsp)
        movq    %rsi, 8(%rsp)
        leaq    (%rsp), %rax
        movq    %rax, 192(%rsp)
        leaq    208(%rsp), %rax
        movq    %rax, 184(%rsp)
        movl    $48, 180(%rsp)
        movl    $8, 176(%rsp)
        movl    176(%rsp), %eax
        cmpl    $47, %eax
        jbe     .LBB1_3 # bb
.LBB1_1:        # bb3
        movq    184(%rsp), %rcx
        leaq    8(%rcx), %rax
        movq    %rax, 184(%rsp)
.LBB1_2:        # bb4
        movl    (%rcx), %eax
        addq    $200, %rsp
        ret
.LBB1_3:        # bb
        movl    %eax, %ecx
        addl    $8, %eax
        addq    192(%rsp), %rcx
        movl    %eax, 176(%rsp)
        jmp     .LBB1_2 # bb4

gcc 4.3 generates:
        subq    $96, %rsp
.LCFI0:
        leaq    104(%rsp), %rax
        movq    %rsi, -80(%rsp)
        movl    $8, -120(%rsp)
        movq    %rax, -112(%rsp)
        leaq    -88(%rsp), %rax
        movq    %rax, -104(%rsp)
        movl    $8, %eax
        cmpl    $48, %eax
        jb      .L6
        movq    -112(%rsp), %rdx
        movl    (%rdx), %eax
        addq    $96, %rsp
        ret
        .p2align 4,,10
        .p2align 3
.L6:
        mov     %eax, %edx
        addq    -104(%rsp), %rdx
        addl    $8, %eax
        movl    %eax, -120(%rsp)
        movl    (%rdx), %eax
        addq    $96, %rsp
        ret

and it gets compiled into this on x86:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp
        leal    12(%ebp), %eax
        movl    %eax, -4(%ebp)
        leal    16(%ebp), %eax
        movl    %eax, -4(%ebp)
        movl    12(%ebp), %eax
        addl    $4, %esp
        popl    %ebp
        ret

gcc 4.3 generates:
        pushl   %ebp
        movl    %esp, %ebp
        movl    12(%ebp), %eax
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Teach tblgen not to check the bitconvert source type in some cases. This allows
us to consolidate the following patterns in X86InstrMMX.td:

def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                 (iPTR 0))))),
          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;

There are other cases in various td files.

//===---------------------------------------------------------------------===//

Take something like the following on x86-32:
unsigned a(unsigned long long x, unsigned y) { return x % y; }

We currently generate a libcall, but we really shouldn't: the expansion is
shorter and likely faster than the libcall. The expected code is something
like the following:

        movl    12(%ebp), %eax
        movl    16(%ebp), %ecx
        xorl    %edx, %edx
        divl    %ecx
        movl    8(%ebp), %eax
        divl    %ecx
        movl    %edx, %eax
        ret

A similar code sequence works for division.
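
The expansion works because divl divides the 64-bit value in edx:eax by a
32-bit operand: the first divl reduces the high half modulo y, and chaining
that remainder into the second divl produces the full 64-by-32 remainder.
Roughly, in C (our illustration of the idea, not the actual expansion code):

unsigned rem64by32(unsigned long long x, unsigned y) {
  unsigned hi = (unsigned)(x >> 32);
  unsigned lo = (unsigned)x;
  /* First divl: divides 0:hi by y, leaving hi % y in %edx. */
  unsigned r1 = hi % y;
  /* Second divl: divides r1:lo by y. Its remainder is x % y, because
     x == r1 * 2^32 + lo (mod y), and r1 < y guarantees no overflow. */
  return (unsigned)((((unsigned long long)r1 << 32) | lo) % y);
}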

//===---------------------------------------------------------------------===//

These should compile to the same code, but the latter codegens to useless
instructions on X86. This may be a trivial dag combine (GCC PR7061):

struct s1 { unsigned char a, b; };
unsigned long f1(struct s1 x) {
  return x.a + x.b;
}
struct s2 { unsigned a: 8, b: 8; };
unsigned long f2(struct s2 x) {
  return x.a + x.b;
}

//===---------------------------------------------------------------------===//

We currently compile this:

define i32 @func1(i32 %v1, i32 %v2) nounwind {
entry:
  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
  %sum = extractvalue {i32, i1} %t, 0
  %obit = extractvalue {i32, i1} %t, 1
  br i1 %obit, label %overflow, label %normal
normal:
  ret i32 %sum
overflow:
  call void @llvm.trap()
  unreachable
}
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare void @llvm.trap()

to:

_func1:
        movl 4(%esp), %eax
        addl 8(%esp), %eax
        jo LBB1_2       ## overflow
LBB1_1: ## normal
        ret
LBB1_2: ## overflow
        ud2

it would be nice to produce "into" someday.

//===---------------------------------------------------------------------===//

Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions. This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions, nor if the OF or CF flags are needed.

The shift operators have the complication that when the shift count is
zero, EFLAGS is not set, so they can only subsume a test instruction if
the shift count is known to be non-zero. Also, using the EFLAGS value
from a shift is apparently very slow on some x86 implementations.

In read-modify-write instructions, the root node in the isel match is
the store, and isel has no way for the use of the EFLAGS result of the
arithmetic to be remapped to the new node.

Add and subtract instructions set OF on signed overflow and CF on unsigned
overflow, while test instructions always clear OF and CF. In order to
replace a test with an add or subtract in a situation where OF or CF is
needed, codegen must be able to prove that the operation cannot see
signed or unsigned overflow, respectively.

//===---------------------------------------------------------------------===//

memcpy/memmove do not lower to SSE copies when possible.
A silly example is:

define <16 x float> @foo(<16 x float> %A) nounwind {
  %tmp = alloca <16 x float>, align 16
  %tmp2 = alloca <16 x float>, align 16
  store <16 x float> %A, <16 x float>* %tmp
  %s = bitcast <16 x float>* %tmp to i8*
  %s2 = bitcast <16 x float>* %tmp2 to i8*
  call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
  %R = load <16 x float>* %tmp2
  ret <16 x float> %R
}

declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind

which compiles to:

_foo:
        subl    $140, %esp
        movaps  %xmm3, 112(%esp)
        movaps  %xmm2, 96(%esp)
        movaps  %xmm1, 80(%esp)
        movaps  %xmm0, 64(%esp)
        movl    60(%esp), %eax
        movl    %eax, 124(%esp)
        movl    56(%esp), %eax
        movl    %eax, 120(%esp)
        movl    52(%esp), %eax
        <many many more 32-bit copies>
        movaps  (%esp), %xmm0
        movaps  16(%esp), %xmm1
        movaps  32(%esp), %xmm2
        movaps  48(%esp), %xmm3
        addl    $140, %esp
        ret

On Nehalem, it may even be cheaper to just use movups when unaligned than to
fall back to lower-granularity chunks.

//===---------------------------------------------------------------------===//

Implement processor-specific optimizations for parity with GCC on these
processors. GCC does two optimizations:

1. ix86_pad_returns inserts a noop before ret instructions if they are
   immediately preceded by a conditional branch or are the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   code contains more than 3 branches.

The first one is done for all AMDs, Core2, and "Generic".
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".

//===---------------------------------------------------------------------===//

Testcase:
int x(int a) { return (a&0xf0)>>4; }

Current output:
        movl    4(%esp), %eax
        shrl    $4, %eax
        andl    $15, %eax
        ret

Ideal output:
        movzbl  4(%esp), %eax
        shrl    $4, %eax
        ret

//===---------------------------------------------------------------------===//

Re-implement the atomic builtins __sync_add_and_fetch() and
__sync_sub_and_fetch() properly.

When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to use xadd to implement these. Instead, it can use
the add, sub, inc, and dec instructions with the "lock" prefix.

This is currently implemented using an instruction selection trick. The issue
is that the target-independent pattern produces one output and a chain, and we
want to map it into one that just outputs a chain. The current trick is to
select it into a MERGE_VALUES with the first definition being an implicit_def.
The proper solution is to add new ISD opcodes for the no-output variant. The
DAG combiner can then transform the node before it gets to target node
selection.

Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions. We need a
way to add target-specific information to target nodes and have this
information carried over to machine instructions. The asm printer (or JIT) can
then use this information to add the "lock" prefix.
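
To illustrate the first point (our example, not from the original notes), when
the result is dead:

void increment(int *p) {
  __sync_add_and_fetch(p, 1);   /* return value unused */
}

could lower to just a locked RMW on memory:

_increment:
        lock
        incl (%rdi)
        ret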

//===---------------------------------------------------------------------===//

struct B {
  unsigned char y0 : 1;
};

int bar(struct B* a) { return a->y0; }

define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
  %2 = load i8* %1, align 1
  %3 = and i8 %2, 1
  %4 = zext i8 %3 to i32
  ret i32 %4
}

bar:                                    # @bar
# BB#0:
        movb (%rdi), %al
        andb $1, %al
        movzbl %al, %eax
        ret

Missed optimization: this should be movl+andl.

//===---------------------------------------------------------------------===//

The x86_64 abi says:

Booleans, when stored in a memory object, are stored as single byte objects the
value of which is always 0 (false) or 1 (true).

We are not using this fact:

int bar(_Bool *a) { return *a; }

define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
  %1 = load i8* %a, align 1, !tbaa !0
  %tmp = and i8 %1, 1
  %2 = zext i8 %tmp to i32
  ret i32 %2
}

bar:
        movb (%rdi), %al
        andb $1, %al
        movzbl %al, %eax
        ret

GCC produces:

bar:
        movzbl (%rdi), %eax
        ret

//===---------------------------------------------------------------------===//

Consider the following two functions compiled with clang:
_Bool foo(int *x) { return !(*x & 4); }
unsigned bar(int *x) { return !(*x & 4); }

foo:
        movl 4(%esp), %eax
        testb $4, (%eax)
        sete %al
        movzbl %al, %eax
        ret

bar:
        movl 4(%esp), %eax
        movl (%eax), %eax
        shrl $2, %eax
        andl $1, %eax
        xorl $1, %eax
        ret

The second function generates more code even though the two functions are
functionally identical.

//===---------------------------------------------------------------------===//

Take the following C code:
int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }

We generate the following IR with clang:
define i32 @f(i32 %a, i32 %b) nounwind readnone {
entry:
  %tmp = xor i32 %b, %a                 ; <i32> [#uses=1]
  %tmp6 = and i32 %tmp, 255             ; <i32> [#uses=1]
  %cmp = icmp eq i32 %tmp6, 0           ; <i1> [#uses=1]
  %conv5 = zext i1 %cmp to i32          ; <i32> [#uses=1]
  ret i32 %conv5
}

And the following x86 code:
        xorl %esi, %edi
        testb $-1, %dil
        sete %al
        movzbl %al, %eax
        ret

A cmpb instead of the xorl+testb would be one instruction shorter.

//===---------------------------------------------------------------------===//

Given the following C code:
int f(int a, int b) { return (signed char)a == (signed char)b; }

We generate the following IR with clang:
define i32 @f(i32 %a, i32 %b) nounwind readnone {
entry:
  %sext = shl i32 %a, 24                ; <i32> [#uses=1]
  %conv1 = ashr i32 %sext, 24           ; <i32> [#uses=1]
  %sext6 = shl i32 %b, 24               ; <i32> [#uses=1]
  %conv4 = ashr i32 %sext6, 24          ; <i32> [#uses=1]
  %cmp = icmp eq i32 %conv1, %conv4     ; <i1> [#uses=1]
  %conv5 = zext i1 %cmp to i32          ; <i32> [#uses=1]
  ret i32 %conv5
}

And the following x86 code:
        movsbl %sil, %eax
        movsbl %dil, %ecx
        cmpl %eax, %ecx
        sete %al
        movzbl %al, %eax
        ret

It should be possible to eliminate the sign extensions.
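
Equality of the sign-extended values depends only on the low bytes, so
presumably something like this would do (our sketch of the desired output):

        cmpb %sil, %dil
        sete %al
        movzbl %al, %eax
        ret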

//===---------------------------------------------------------------------===//

LLVM misses a load+store narrowing opportunity in this code:

%struct.bf = type { i64, i16, i16, i32 }

@bfi = external global %struct.bf*              ; <%struct.bf**> [#uses=2]

define void @t1() nounwind ssp {
entry:
  %0 = load %struct.bf** @bfi, align 8                  ; <%struct.bf*> [#uses=1]
  %1 = getelementptr %struct.bf* %0, i64 0, i32 1       ; <i16*> [#uses=1]
  %2 = bitcast i16* %1 to i32*                          ; <i32*> [#uses=2]
  %3 = load i32* %2, align 1                            ; <i32> [#uses=1]
  %4 = and i32 %3, -65537                               ; <i32> [#uses=1]
  store i32 %4, i32* %2, align 1
  %5 = load %struct.bf** @bfi, align 8                  ; <%struct.bf*> [#uses=1]
  %6 = getelementptr %struct.bf* %5, i64 0, i32 1       ; <i16*> [#uses=1]
  %7 = bitcast i16* %6 to i32*                          ; <i32*> [#uses=2]
  %8 = load i32* %7, align 1                            ; <i32> [#uses=1]
  %9 = and i32 %8, -131073                              ; <i32> [#uses=1]
  store i32 %9, i32* %7, align 1
  ret void
}

LLVM currently emits this:

        movq bfi(%rip), %rax
        andl $-65537, 8(%rax)
        movq bfi(%rip), %rax
        andl $-131073, 8(%rax)
        ret

It could narrow the loads and stores to emit this:

        movq bfi(%rip), %rax
        andb $-2, 10(%rax)
        movq bfi(%rip), %rax
        andb $-3, 10(%rax)
        ret

The trouble is that there is a TokenFactor between the store and the
load, making it non-trivial to determine if there's anything between
the load and the store which would prohibit narrowing.

//===---------------------------------------------------------------------===//

This code:
void foo(unsigned x) {
  if (x == 0) bar();
  else if (x == 1) qux();
}

currently compiles into:
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        je LBB0_3
        testl %eax, %eax
        jne LBB0_4

The testl could be removed:
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        je LBB0_3
        jb LBB0_4

0 is the only unsigned number < 1.

//===---------------------------------------------------------------------===//

This code:

%0 = type { i32, i1 }

define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
entry:
  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
  %cmp = extractvalue %0 %uadd, 1
  %inc = zext i1 %cmp to i32
  %add = add i32 %x, %sum
  %z.0 = add i32 %add, %inc
  ret i32 %z.0
}

declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone

compiles to:

_add32carry:                            ## @add32carry
        addl %esi, %edi
        sbbl %ecx, %ecx
        movl %edi, %eax
        subl %ecx, %eax
        ret

But it could be:

_add32carry:
        leal (%rsi,%rdi), %eax
        cmpl %esi, %eax
        adcl $0, %eax
        ret

//===---------------------------------------------------------------------===//

The hot loop of 256.bzip2 contains code that looks a bit like this:

int foo(char *P, char *Q, int x, int y) {
  if (P[0] != Q[0])
    return P[0] < Q[0];
  if (P[1] != Q[1])
    return P[1] < Q[1];
  if (P[2] != Q[2])
    return P[2] < Q[2];
  return P[3] < Q[3];
}

In the real code, we get a lot more wrong than this.
However, even in this code we generate:

_foo:                                   ## @foo
## BB#0:                                ## %entry
        movb (%rsi), %al
        movb (%rdi), %cl
        cmpb %al, %cl
        je LBB0_2
LBB0_1:                                 ## %if.then
        cmpb %al, %cl
        jmp LBB0_5
LBB0_2:                                 ## %if.end
        movb 1(%rsi), %al
        movb 1(%rdi), %cl
        cmpb %al, %cl
        jne LBB0_1
## BB#3:                                ## %if.end38
        movb 2(%rsi), %al
        movb 2(%rdi), %cl
        cmpb %al, %cl
        jne LBB0_1
## BB#4:                                ## %if.end60
        movb 3(%rdi), %al
        cmpb 3(%rsi), %al
LBB0_5:                                 ## %if.end60
        setl %al
        movzbl %al, %eax
        ret

Note that we generate jumps to LBB0_1, which does a redundant compare. The
redundant compare also forces the register values to be live, which prevents
folding one of the loads into the compare. In contrast, GCC 4.2 produces:

_foo:
        movzbl (%rsi), %eax
        cmpb %al, (%rdi)
        jne L10
L12:
        movzbl 1(%rsi), %eax
        cmpb %al, 1(%rdi)
        jne L10
        movzbl 2(%rsi), %eax
        cmpb %al, 2(%rdi)
        jne L10
        movzbl 3(%rdi), %eax
        cmpb 3(%rsi), %al
L10:
        setl %al
        movzbl %al, %eax
        ret

which is "perfect".

//===---------------------------------------------------------------------===//

For the branch in the following code:
int a();
int b(int x, int y) {
  if (x & (1<<(y&7)))
    return a();
  return y;
}

We currently generate:
        movb %sil, %al
        andb $7, %al
        movzbl %al, %eax
        btl %eax, %edi
        jae .LBB0_2

movl+andl would be shorter than the movb+andb+movzbl sequence.

//===---------------------------------------------------------------------===//

For the following:
struct u1 {
  float x, y;
};
float foo(struct u1 u) {
  return u.x + u.y;
}

We currently generate:
        movdqa %xmm0, %xmm1
        pshufd $1, %xmm0, %xmm0         # xmm0 = xmm0[1,0,0,0]
        addss %xmm1, %xmm0
        ret

We could save an instruction here by commuting the addss.

//===---------------------------------------------------------------------===//

This (from PR9661):

float clamp_float(float a) {
  if (a > 1.0f)
    return 1.0f;
  else if (a < 0.0f)
    return 0.0f;
  else
    return a;
}

could compile to:

clamp_float:                            # @clamp_float
        movss .LCPI0_0(%rip), %xmm1
        minss %xmm1, %xmm0
        pxor %xmm1, %xmm1
        maxss %xmm1, %xmm0
        ret

with -ffast-math.

//===---------------------------------------------------------------------===//

This function (from PR9803):

int clamp2(int a) {
  if (a > 5)
    a = 5;
  if (a < 0)
    return 0;
  return a;
}

compiles to:

_clamp2:                                ## @clamp2
        pushq %rbp
        movq %rsp, %rbp
        cmpl $5, %edi
        movl $5, %ecx
        cmovlel %edi, %ecx
        testl %ecx, %ecx
        movl $0, %eax
        cmovnsl %ecx, %eax
        popq %rbp
        ret

The move of 0 could be scheduled above the test so that it can be an
xor reg,reg.

//===---------------------------------------------------------------------===//

GCC PR48986. We currently compile this:

void bar(void);
void yyy(int* p) {
  if (__sync_fetch_and_add(p, -1) == 1)
    bar();
}

into:
        movl $-1, %eax
        lock
        xaddl %eax, (%rdi)
        cmpl $1, %eax
        je LBB0_2

Instead we could generate:

        lock
        decl (%rdi)
        je LBB0_2

The trick is to match "fetch_and_add(X, -C) == C".
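
The same idea generalizes to other constants; for example (our illustration,
not from the PR):

void zzz(int *p) {
  if (__sync_fetch_and_add(p, -2) == 2)  /* old value 2 <=> new value 0 */
    bar();
}

could become "lock subl $2, (%rdi)" followed by je, since the locked subtract
sets ZF exactly when the old value was C.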

//===---------------------------------------------------------------------===//

unsigned t(unsigned a, unsigned b) {
  return a <= b ? 5 : -5;
}

We generate:
        movl $5, %ecx
        cmpl %esi, %edi
        movl $-5, %eax
        cmovbel %ecx, %eax

GCC:
        cmpl %edi, %esi
        sbbl %eax, %eax
        andl $-10, %eax
        addl $5, %eax
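
GCC's sequence is the classic sbb idiom; in C it corresponds to something like
(our sketch, not GCC's source):

unsigned t2(unsigned a, unsigned b) {
  unsigned m = -(unsigned)(b < a);  /* sbbl: all-ones if b < a, else 0 */
  return (m & (unsigned)-10) + 5;   /* b < a ? -5 : 5 */
}

//===---------------------------------------------------------------------===//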