Name                              Date         Size       #Lines     LOC

AsmParser/                        03-May-2024  -           1,363     975
Disassembler/                     03-May-2024  -           3,372   2,237
InstPrinter/                      03-May-2024  -             839     585
MCTargetDesc/                     03-May-2024  -             732     495
TargetInfo/                       03-May-2024  -             105      53
Utils/                            03-May-2024  -             337     200
Android.mk                        03-May-2024  1.7 KiB        73      54
CMakeLists.txt                    03-May-2024  1.9 KiB        62      56
Makefile                          03-May-2024  861 B          25      11
README-FPStack.txt                03-May-2024  2.7 KiB        86      58
README-MMX.txt                    03-May-2024  1.5 KiB        72      55
README-SSE.txt                    03-May-2024  26.4 KiB      938     719
README-UNIMPLEMENTED.txt          03-May-2024  679 B          15      12
README-X86-64.txt                 03-May-2024  6 KiB         185     150
README.txt                        03-May-2024  53.5 KiB    2,069   1,619
SSEDomainFix.cpp                  03-May-2024  16.1 KiB      507     345
X86.h                             03-May-2024  3.1 KiB        92      39
X86.td                            03-May-2024  11.2 KiB      235     203
X86AsmBackend.cpp                 03-May-2024  14.4 KiB      454     331
X86AsmPrinter.cpp                 03-May-2024  26.1 KiB      729     571
X86AsmPrinter.h                   03-May-2024  2.9 KiB        88      54
X86COFFMachineModuleInfo.cpp      03-May-2024  615 B          21       4
X86COFFMachineModuleInfo.h        03-May-2024  1.4 KiB        47      22
X86CallingConv.td                 03-May-2024  15.3 KiB      402     315
X86CodeEmitter.cpp                03-May-2024  34.8 KiB    1,000     780
X86CompilationCallback_Win64.asm  03-May-2024  1.6 KiB        69      55
X86ELFWriterInfo.cpp              03-May-2024  4.2 KiB       154     125
X86ELFWriterInfo.h                03-May-2024  2.2 KiB        60      20
X86FastISel.cpp                   03-May-2024  71.6 KiB    2,134   1,576
X86FixupKinds.h                   03-May-2024  1.2 KiB        34      16
X86FloatingPoint.cpp              03-May-2024  63.7 KiB    1,710   1,097
X86FrameLowering.cpp              03-May-2024  42.7 KiB    1,194     835
X86FrameLowering.h                03-May-2024  2.4 KiB        69      40
X86ISelDAGToDAG.cpp               03-May-2024  81.2 KiB    2,254   1,675
X86ISelLowering.cpp               03-May-2024  501.3 KiB  13,158   9,755
X86ISelLowering.h                 03-May-2024  41 KiB        956     433
X86Instr3DNow.td                  03-May-2024  4.3 KiB       103      89
X86InstrArithmetic.td             03-May-2024  53.5 KiB    1,126     985
X86InstrBuilder.h                 03-May-2024  6.7 KiB       185     107
X86InstrCMovSetCC.td              03-May-2024  4.9 KiB       105      95
X86InstrCompiler.td               03-May-2024  74.4 KiB    1,676   1,469
X86InstrControl.td                03-May-2024  13.9 KiB      305     273
X86InstrExtension.td              03-May-2024  7.8 KiB       152     134
X86InstrFMA.td                    03-May-2024  2.8 KiB        61      55
X86InstrFPStack.td                03-May-2024  32.7 KiB      649     585
X86InstrFormats.td                03-May-2024  20.5 KiB      536     479
X86InstrFragmentsSIMD.td          03-May-2024  20.6 KiB      468     396
X86InstrInfo.cpp                  03-May-2024  119.8 KiB   3,241   2,746
X86InstrInfo.h                    03-May-2024  35.9 KiB      887     409
X86InstrInfo.td                   03-May-2024  75.2 KiB    1,649   1,402
X86InstrMMX.td                    03-May-2024  21.9 KiB      455     387
X86InstrSSE.td                    03-May-2024  287.9 KiB   5,867   5,249
X86InstrShiftRotate.td            03-May-2024  37.1 KiB      747     686
X86InstrSystem.td                 03-May-2024  19.5 KiB      430     354
X86InstrVMX.td                    03-May-2024  2.4 KiB        55      51
X86JITInfo.cpp                    03-May-2024  18.8 KiB      575     447
X86JITInfo.h                      03-May-2024  3 KiB          82      31
X86MCCodeEmitter.cpp              03-May-2024  35.6 KiB    1,045     734
X86MCInstLower.cpp                03-May-2024  27.6 KiB      693     535
X86MCInstLower.h                  03-May-2024  1.3 KiB        53      34
X86MachObjectWriter.cpp           03-May-2024  22.3 KiB      555     401
X86MachineFunctionInfo.h          03-May-2024  5.2 KiB       136      69
X86RegisterInfo.cpp               03-May-2024  29.8 KiB      848     709
X86RegisterInfo.h                 03-May-2024  4.6 KiB       136      54
X86RegisterInfo.td                03-May-2024  19.4 KiB      468     415
X86Relocations.h                  03-May-2024  2 KiB          53      15
X86SelectionDAGInfo.cpp           03-May-2024  9.8 KiB       260     201
X86SelectionDAGInfo.h             03-May-2024  1.9 KiB        57      31
X86Subtarget.cpp                  03-May-2024  11.1 KiB      324     200
X86Subtarget.h                    03-May-2024  8.9 KiB       261     133
X86TargetMachine.cpp              03-May-2024  7 KiB         194     133
X86TargetMachine.h                03-May-2024  4.4 KiB       136     104
X86TargetObjectFile.cpp           03-May-2024  4.1 KiB       122      90
X86TargetObjectFile.h             03-May-2024  2.1 KiB        61      39

README-FPStack.txt

//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: FP stack related stuff
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html

//===---------------------------------------------------------------------===//

This should use fiadd on chips where it is profitable:
double foo(double P, int *I) { return P+*I; }

We have fiadd patterns now, but the following two have the same cost and
complexity. We need a way to specify that the latter is more profitable.

def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
                    [(set RFP:$dst, (fadd RFP:$src1,
                                     (extloadf64f32 addr:$src2)))]>;
                // ST(0) = ST(0) + [mem32]

def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
                    [(set RFP:$dst, (fadd RFP:$src1,
                                     (X86fild addr:$src2, i32)))]>;
                // ST(0) = ST(0) + [mem32int]

//===---------------------------------------------------------------------===//

The FP stackifier should handle simple permutations to reduce the number of
shuffle (fxch) instructions, e.g. turning:

fld P	->		fld Q
fld Q			fld P
fxch

or:

fxch	->		fucomi
fucomi			jl X
jg X

Ideas:
http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html


//===---------------------------------------------------------------------===//

Add a target-specific hook to the DAG combiner to handle SINT_TO_FP and
FP_TO_SINT when the source operand is already in memory.

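For illustration, code of this shape would benefit (an untested sketch of my
own, not from the backend): the integer source of SINT_TO_FP and the integer
destination of FP_TO_SINT are already in memory, so the x87 code could use
fild/fistp with a memory operand directly instead of going through a register
and a temporary stack slot.

/* Hypothetical examples, assuming an x87 (FP stack) target. */
double sint_to_fp(const int *p) {
  return (double)*p;              /* could be a single fildl (mem) */
}

void fp_to_sint(double d, int *out) {
  *out = (int)d;                  /* could be a single fistpl (mem) */
}
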
//===---------------------------------------------------------------------===//

Open-code rint, floor, ceil, trunc:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html

Open-code the sincos[f] libcall.

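As a concrete illustration (my own example): code that computes both sinf and
cosf of the same argument ends up as separate calls, or as a sincosf libcall
when the calls are merged; an open-coded fsincos could produce both results at
once.

#include <math.h>

void polar_to_xy(float r, float theta, float *x, float *y) {
  *x = r * cosf(theta);           /* cosf and sinf of the same argument: */
  *y = r * sinf(theta);           /* one fsincos could compute both      */
}
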
//===---------------------------------------------------------------------===//

None of the FPStack instructions are handled in
X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
folding spill code into the instructions.

//===---------------------------------------------------------------------===//

Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:

unsigned int foo(double x) { return x; }

foo:
	subl $20, %esp
	movsd 24(%esp), %xmm0
	movsd %xmm0, 8(%esp)
	fldl 8(%esp)
	fisttpll (%esp)
	movl (%esp), %eax
	addl $20, %esp
	ret

This just requires being smarter when custom expanding fptoui.

//===---------------------------------------------------------------------===//


README-MMX.txt

//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: MMX-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

This:

#include <mmintrin.h>

__v2si qux(int A) {
  return (__v2si){ 0, A };
}

is compiled into:

_qux:
        subl $28, %esp
        movl 32(%esp), %eax
        movd %eax, %mm0
        movq %mm0, (%esp)
        movl (%esp), %eax
        movl %eax, 20(%esp)
        movq %mm0, 8(%esp)
        movl 12(%esp), %eax
        movl %eax, 16(%esp)
        movq 16(%esp), %mm0
        addl $28, %esp
        ret

Yuck!

GCC gives us:

_qux:
        subl    $12, %esp
        movl    16(%esp), %eax
        movl    20(%esp), %edx
        movl    $0, (%eax)
        movl    %edx, 4(%eax)
        addl    $12, %esp
        ret     $4

//===---------------------------------------------------------------------===//

We generate crappy code for this:

__m64 t() {
  return _mm_cvtsi32_si64(1);
}

_t:
	subl	$12, %esp
	movl	$1, %eax
	movd	%eax, %mm0
	movq	%mm0, (%esp)
	movl	(%esp), %eax
	movl	4(%esp), %edx
	addl	$12, %esp
	ret

The extra stack traffic is covered in the previous entry. The other problem is
that we are not smart about materializing constants in MMX registers. With
-m64 we get:

	movl	$1, %eax
	movd	%eax, %mm0
	movd	%mm0, %rax
	ret

We should be using a constant-pool load instead:
	movq	LC0(%rip), %rax


README-SSE.txt

//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should
pattern match them.  For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
	addss	%xmm0, %xmm1
	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
	movaps	%xmm0, %xmm3
	addss	%xmm1, %xmm3
	movdqa	%xmm2, %xmm0
	addss	%xmm3, %xmm0
	ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
	movdqa	%xmm0, %xmm2
	addss	%xmm1, %xmm2
	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
	addss	%xmm1, %xmm3
	movaps	%xmm2, %xmm0
	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
	ret

seems silly when it could just be one addps.


//===---------------------------------------------------------------------===//

Expand libm rounding functions inline:  Significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

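Something like the following (an illustrative sketch of my own, using the
standard MXCSR intrinsics; the exact mechanism the backend would emit is an
open question): "main" would start by setting the flush-to-zero and
denormals-are-zero bits.

#include <xmmintrin.h>          /* _MM_SET_FLUSH_ZERO_MODE */
#include <pmmintrin.h>          /* _MM_SET_DENORMALS_ZERO_MODE */

int main(void) {
  /* Hypothetical prologue for "main" under unsafe math. */
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  /* ... rest of the program ... */
  return 0;
}
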
//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = fadd double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit:

_X:
        subl $12, %esp
        xorpd %xmm0, %xmm0
        addsd 24(%esp), %xmm0
        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
        jb LBB_X_2
LBB_X_1:
        movsd %xmm0, %xmm2
LBB_X_2:
        movsd %xmm2, (%esp)
        fldl (%esp)
        addl $12, %esp
        ret

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

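A rough sketch of the shape of such a lowered copy, written with SSE
intrinsics (my own illustration; it assumes 16-byte-aligned pointers and a
size that is a multiple of 16):

#include <emmintrin.h>
#include <stddef.h>

void copy_aligned(void *dst, const void *src, size_t n) {
  __m128i *d = (__m128i *)dst;
  const __m128i *s = (const __m128i *)src;
  for (size_t i = 0; i < n / 16; ++i)
    _mm_store_si128(d + i, _mm_load_si128(s + i));   /* movdqa/movaps pairs */
}
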
//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.

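In other words (a sketch of my own, not from the note): the two copysign
results are equal exactly when x and y have the same sign bit, so the test can
be done on the integer side with one xor and a sign-bit mask:

#include <stdint.h>
#include <string.h>

int same_sign(double x, double y) {
  uint64_t xi, yi;
  memcpy(&xi, &x, sizeof xi);
  memcpy(&yi, &y, sizeof yi);
  return ((xi ^ yi) & 0x8000000000000000ULL) == 0;   /* sign bits match */
}
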
//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value. Also movlps on the
lower half of a v4sf value.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps    (%edx), %xmm2                                 #59.21
        movaps    (%edx), %xmm5                                 #60.21
        movaps    (%edx), %xmm4                                 #61.21
        movaps    (%edx), %xmm3                                 #62.21
        movl      40(%ecx), %ebp                                #69.49
        shufps    $0, %xmm2, %xmm5                              #60.21
        movl      100(%esp), %ebx                               #69.20
        movl      (%ebx), %edi                                  #69.20
        imull     %ebp, %edi                                    #69.49
        addl      (%eax), %edi                                  #70.33
        shufps    $85, %xmm2, %xmm4                             #61.21
        shufps    $170, %xmm2, %xmm3                            #62.21
        shufps    $255, %xmm2, %xmm2                            #63.21
        lea       (%ebp,%ebp,2), %ebx                           #69.49
        negl      %ebx                                          #69.49
        lea       -3(%edi,%ebx), %ebx                           #70.33
        shll      $4, %ebx                                      #68.37
        addl      32(%ecx), %ebx                                #68.37
        testb     $15, %bl                                      #91.13
        jne       L_B1.24       # Prob 5%                       #91.13

This is the LLVM code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
	%reg1078 = MOV32ri -3
	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
	%reg1080 = IMUL32rr %reg1079, %reg1037
	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
	%reg1082 = SHL32ri %reg1038, 4
	%reg1039 = ADD32rr %reg1036, %reg1082
	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
	%reg1040 = MOV32rr %reg1039
	%reg1084 = AND32ri8 %reg1039, 15
	CMP32ri8 %reg1084, 0
	JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
	%EAX = MOV32ri -3
	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
	%EDX = MOV32rm %EDX, 1, %NOREG, 40
	IMUL32rr %EAX<def&use>, %EDX
	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
	%ESI = MOV32rm %ESI, 1, %NOREG, 0
	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
	%EAX = LEA32r %ESI, 1, %EAX, -3
	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
	%ESI = MOV32rm %ESI, 1, %NOREG, 32
	%EDI = MOV32rr %EAX
	SHL32ri %EDI<def&use>, 4
	ADD32rr %EDI<def&use>, %ESI
	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
	%XMM1 = MOVAPSrr %XMM0
	SHUFPSrr %XMM1<def&use>, %XMM1, 170
	%XMM2 = MOVAPSrr %XMM0
	SHUFPSrr %XMM2<def&use>, %XMM2, 0
	%XMM3 = MOVAPSrr %XMM0
	SHUFPSrr %XMM3<def&use>, %XMM3, 255
	SHUFPSrr %XMM0<def&use>, %XMM0, 85
	%EBX = MOV32rr %EDI
	AND32ri8 %EBX<def&use>, 15
	CMP32ri8 %EBX, 0
	JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
since it appears as operand two in more than one shufps op, it results in a
number of copies. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator should
make the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

movss 4(%esp), %xmm1
mulss %xmm1, %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd.  We could compile this to:

movss 4(%esp), %xmm0
mulss %xmm0, %xmm0
ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

movaps c(%esp), %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

xorps %xmm0, %xmm0
movaps c2(%esp), %xmm1
movss %xmm1, %xmm0
ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

movss c2(%esp), %xmm0
ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one element instead of all four
elements. This can be used to simplify a variety of shuffle operations where
the other elements are fixed zeros.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
	movl	8(%esp), %eax
	movaps	(%eax), %xmm0
	pxor	%xmm1, %xmm1
	movaps	%xmm0, %xmm2
	shufps	$50, %xmm1, %xmm2
	shufps	$132, %xmm2, %xmm0
	movaps	%xmm0, (%eax)
	ret

Would it be better to generate:

_test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
	xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merges four float loads into a single
128-bit load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones

and:
     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
     psrlq   xmm1, 31     ; xmm1 = all 100000000000...

instead of using a load from the constant pool.  The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):
_x:
        movzwl  4(%esp), %eax
        movd    %eax, %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret
_y:
        movd    4(%esp), %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff through a GPR.  It seems like
movd would be sufficient in both cases, as the value is already zero extended
in the 32-bit stack slot IIRC.  For signed short, it should also be safe, as a
truly negative value would be undefined for pslld.


//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
	subl	$12, %esp
	movsd	16(%esp), %xmm0
	movsd	%xmm0, (%esp)
	movl	4(%esp), %eax
	shrl	$31, %eax
	addl	$12, %esp
	ret

We should use movmskp{s|d} instead.

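Expressed with intrinsics (a sketch of my own of the desired shape, not the
backend's current output), the whole thing is one movmskpd plus a mask:

#include <emmintrin.h>

int t1_desired(double d) {
  /* movmskpd extracts the sign bits of both lanes; bit 0 is the low lane. */
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
}
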
//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load.  This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner.  This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack.  It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load.  If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load.  For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
entry:
 %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:					#  <4 x float>
	.long	2147483648	# float -0
	.long	2147483648	# float -0
	.long	2147483648	# float -0
	.long	2147483648	# float -0
_ccosf:
	subl	$12, %esp
	movss	16(%esp), %xmm0
	movss	%xmm0, 4(%esp)
	movss	20(%esp), %xmm0
	xorps	LCPI1_0, %xmm0
	movss	%xmm0, (%esp)
	call	L_ccoshf$stub
	addl	$12, %esp
	ret

Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor.  This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===---------------------------------------------------------------------===//

SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
sitting between the truncate and the extract.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously.  Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything.  This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

'select' on vectors and scalars could be a whole lot better.  We currently
lower them to conditional branches.  On x86-64 for example, we compile this:

double test(double a, double b, double c, double d) { return a<b ? c : d; }

to:

_test:
	ucomisd	%xmm0, %xmm1
	ja	LBB1_2	# entry
LBB1_1:	# entry
	movapd	%xmm3, %xmm2
LBB1_2:	# entry
	movapd	%xmm2, %xmm0
	ret

instead of:

_test:
	cmpltsd	%xmm1, %xmm0
	andpd	%xmm0, %xmm2
	andnpd	%xmm3, %xmm0
	orpd	%xmm2, %xmm0
	ret

For unpredictable branches, the latter is much more efficient.  This should
just be a matter of having scalar SSE map to SELECT_CC and custom expanding
or iseling it.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be a vector spill or not.
The stack realignment logic is overly conservative here, but otherwise we
could produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
- 22725, - 12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
 return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
	movss	4(%esp), %xmm0
	pshufd	$81, %xmm0, %xmm0
	ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
	xorps	%xmm1, %xmm1
	movss	%xmm0, %xmm1
	pshufd	$81, %xmm1, %xmm0
	ret

In SSE4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
 return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
       insertps        $0x10, x2(%rip), %xmm0
       insertps        $0x10, x3(%rip), %xmm1
       movaps  %xmm1, %xmm2
       movlhps %xmm0, %xmm2
       movaps  %xmm2, %xmm0
       ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind  {
	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
	ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:					##  <4 x i32>
	.long	10
	.long	10
	.long	10
	.long	10
	.text
	.align	4,0x90
	.globl	_f
_f:
	pshufd	$3, %xmm0, %xmm1
	movd	%xmm1, %eax
	imull	LCPI1_0+12, %eax
	movd	%eax, %xmm1
	pshufd	$1, %xmm0, %xmm2
	movd	%xmm2, %eax
	imull	LCPI1_0+4, %eax
	movd	%eax, %xmm2
	punpckldq	%xmm1, %xmm2
	movd	%xmm0, %eax
	imull	LCPI1_0, %eax
	movd	%eax, %xmm1
	movhlps	%xmm0, %xmm0
	movd	%xmm0, %eax
	imull	LCPI1_0+8, %eax
	movd	%eax, %xmm0
	punpckldq	%xmm0, %xmm1
	movaps	%xmm1, %xmm0
	punpckldq	%xmm2, %xmm0
	ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
	movl	$1, %eax
	xorps	%xmm0, %xmm0
	pinsrw	$2, %eax, %xmm0
	movzbl	4(%esp), %eax
	pinsrw	$3, %eax, %xmm0
	movl	$256, %eax
	pinsrw	$7, %eax, %xmm0
	ret


gcc-4.2:
	subl	$12, %esp
	movzbl	16(%esp), %eax
	movdqa	LC0, %xmm0
	pinsrw	$3, %eax, %xmm0
	addl	$12, %esp
	ret
	.const
	.align 4
LC0:
	.word	0
	.word	0
	.word	1
	.word	0
	.word	0
	.word	0
	.word	0
	.word	256

With SSE4, it should be:
      movdqa  .LC0(%rip), %xmm0
      pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. insertelement of a constant into a vector of constants should
also result in a vector of constants; e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

	.align	4
LCPI1_1:					##  float
	.long	1065353216	## float 1
	.const

	.align	4
LCPI1_0:					##  <4 x float>
	.space	4
	.long	1065353216	## float 1
	.space	4
	.long	1065353216	## float 1
	.text
	.align	4,0x90
	.globl	_t
_t:
	xorps	%xmm0, %xmm0
	movhps	LCPI1_0, %xmm0
	movss	LCPI1_1, %xmm1
	movaps	%xmm0, %xmm2
	shufps	$2, %xmm1, %xmm2
	shufps	$132, %xmm2, %xmm0
	movaps	%xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext  %x) nounwind  {
	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
	ret float %tmp12
}

which compiles to:

_foo:
	subl	$4, %esp
	movzbl	8(%esp), %eax
	cvtsi2ss	%eax, %xmm0
	movss	%xmm0, (%esp)
	flds	(%esp)
	addl	$4, %esp
	ret

We should be able to use:
  cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on Core 2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic-programming-based approach to tell when using FPStack
operations is cheaper than SSE.  SciMark MonteCarlo contains code like this,
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
	.long	1082130432	## float 4.000000e+00
_MonteCarlo_num_flops:
	subl	$4, %esp
	movl	8(%esp), %eax
	movl	%eax, (%esp)
	fildl	(%esp)
	fmuls	LCPI1_0
	addl	$4, %esp
	ret

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
	subl	$12, %esp
	cvtsi2sd	16(%esp), %xmm0
	mulsd	LCPI1_0, %xmm0
	movsd	%xmm0, (%esp)
	fldl	(%esp)
	addl	$12, %esp
	ret

There are also other cases in SciMark where using the FP stack is better; it
is cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the FP stack, etc.

//===---------------------------------------------------------------------===//

The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
"cmpsd".  For example, this code:

double d1(double x) { return x == x ? x : x + x; }

Compiles into:

_d1:
	ucomisd	%xmm0, %xmm0
	jnp	LBB1_2
	addsd	%xmm0, %xmm0
	ret
LBB1_2:
	ret

Also, the 'ret's should be shared.  This is PR6032.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214). Perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
	movd	%xmm0, %rax
	shrq	$32, %rax
	movl	%eax, 4(%rdi)
	ret

This would be better kept in the SSE unit by treating XMM0 as a 4 x float and
doing a shuffle from v[1] to v[0], then a float store.

//===---------------------------------------------------------------------===//

On SSE4 machines, we compile this code:

define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
       <2 x float> *%P) nounwind {
  %Z = fadd <2 x float> %Q, %R

  store <2 x float> %Z, <2 x float> *%P
  ret <2 x float> %Z
}

into:

_test2:                                 ## @test2
## BB#0:
	insertps	$0, %xmm2, %xmm2
	insertps	$16, %xmm3, %xmm2
	insertps	$0, %xmm0, %xmm3
	insertps	$16, %xmm1, %xmm3
	addps	%xmm2, %xmm3
	movq	%xmm3, (%rdi)
	movaps	%xmm3, %xmm0
	pshufd	$1, %xmm3, %xmm1
                                        ## kill: XMM1<def> XMM1<kill>
	ret

The insertps's of $0 are pointless complex copies.

//===---------------------------------------------------------------------===//

If SSE4.1 is available we should inline rounding functions instead of emitting
a libcall.

floor: roundsd $0x01, %xmm, %xmm
ceil:  roundsd $0x02, %xmm, %xmm

and likewise for the single precision versions.

Currently, SelectionDAGBuilder doesn't turn calls to these functions into the
corresponding nodes and some targets (including X86) aren't ready for them.

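For reference, a sketch of the inlined forms written with the SSE4.1
intrinsics (my own example; the intrinsics map to roundsd/roundss with the
matching rounding-mode immediate):

#include <smmintrin.h>

double floor_inline(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_floor_sd(v, v));   /* roundsd toward -inf */
}

float ceil_inline(float x) {
  __m128 v = _mm_set_ss(x);
  return _mm_cvtss_f32(_mm_ceil_ss(v, v));    /* roundss toward +inf */
}
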
//===---------------------------------------------------------------------===//


README-UNIMPLEMENTED.txt

//===---------------------------------------------------------------------===//
// Testcases that crash the X86 backend because they aren't implemented
//===---------------------------------------------------------------------===//

These are cases we know the X86 backend doesn't handle.  Patches are welcome
and appreciated, because no one has signed up to implement these yet.
Implementing these would allow elimination of the corresponding intrinsics,
which would be great.

1) vector shifts
2) vector comparisons
3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
4) bitcasts from vectors to scalars: PR2804
5) llvm.atomic.cmp.swap.i128.p0i128: PR3462


README-X86-64.txt

//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//

AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
multiplication by a constant. How much of it applies to Intel's X86-64
implementation? There are definite trade-offs to consider: latency vs. register
pressure vs. code size.

//===---------------------------------------------------------------------===//

Are we better off using branches instead of cmove to implement FP to
unsigned i64?

_conv:
	ucomiss	LC0(%rip), %xmm0
	cvttss2siq	%xmm0, %rdx
	jb	L3
	subss	LC0(%rip), %xmm0
	movabsq	$-9223372036854775808, %rax
	cvttss2siq	%xmm0, %rdx
	xorq	%rax, %rdx
L3:
	movq	%rdx, %rax
	ret

instead of:

_conv:
	movss LCPI1_0(%rip), %xmm1
	cvttss2siq %xmm0, %rcx
	movaps %xmm0, %xmm2
	subss %xmm1, %xmm2
	cvttss2siq %xmm2, %rax
	movabsq $-9223372036854775808, %rdx
	xorq %rdx, %rax
	ucomiss %xmm1, %xmm0
	cmovb %rcx, %rax
	ret

Seems like the jb branch has a high likelihood of being taken. It would save
a few instructions.

//===---------------------------------------------------------------------===//

It's not possible to reference the AH, BH, CH, and DH registers in an
instruction requiring a REX prefix. However, divb and mulb both produce results
in AH, and if isel emits a CopyFromReg it gets turned into a movb that can be
allocated to r8b - r15b.

To get around this, isel emits a CopyFromReg from AX and then right-shifts it
down by 8 and truncates it. It's not pretty, but it works. We need some
register allocation magic to make the hack go away (e.g. putting additional
constraints on the result of the movb).

//===---------------------------------------------------------------------===//

The x86-64 ABI for hidden-argument struct returns requires that the
incoming value of %rdi be copied into %rax by the callee upon return.

The idea is that it saves callers from having to remember this value,
which would often require a callee-saved register. Callees usually
need to keep this value live for most of their body anyway, so it
doesn't add a significant burden on them.

We currently implement this in codegen; however, this is suboptimal
because it makes it quite awkward to implement the optimization for
callers.

A better implementation would be to relax the LLVM IR rules for sret
arguments to allow a function with an sret argument to have a non-void
return type, and to have the front-end set up the sret argument value
as the return value of the function. The front-end could then more easily
emit uses of the returned struct value in terms of the function's
lowered return value, and it would free non-C frontends from a
complication only required by a C-based ABI.

//===---------------------------------------------------------------------===//

We get a redundant zero extension for code like this:

int mask[1000];
int foo(unsigned x) {
 if (x < 10)
   x = x * 45;
 else
   x = x * 78;
 return mask[x];
}

_foo:
LBB1_0:	## entry
	cmpl	$9, %edi
	jbe	LBB1_3	## bb
LBB1_1:	## bb1
	imull	$78, %edi, %eax
LBB1_2:	## bb2
	movl	%eax, %eax                    <----
	movq	_mask@GOTPCREL(%rip), %rcx
	movl	(%rcx,%rax,4), %eax
	ret
LBB1_3:	## bb
	imull	$45, %edi, %eax
	jmp	LBB1_2	## bb2

Before regalloc, we have:

        %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
        JMP mbb<bb2,0x203afb0>
    Successors according to CFG: 0x203afb0 (#3)

bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
    Predecessors according to CFG: 0x203aec0 (#0)
        %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
    Successors according to CFG: 0x203afb0 (#3)

bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
    Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
        %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
                            %reg1026, mbb<bb1,0x203af60>
        %reg1029<def> = MOVZX64rr32 %reg1027

so we'd have to know that IMUL32rri8 leaves the high word zero extended and
be able to recognize the zero extend.  This could also presumably be
implemented if we had whole-function SelectionDAGs.

//===---------------------------------------------------------------------===//

Take the following code
(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
extern unsigned long table[];
unsigned long foo(unsigned char *p) {
  unsigned long tag = *p;
  return table[tag >> 4] + table[tag & 0xf];
}

Current code generated:
	movzbl	(%rdi), %eax
	movq	%rax, %rcx
	andq	$240, %rcx
	shrq	%rcx
	andq	$15, %rax
	movq	table(,%rax,8), %rax
	addq	table(%rcx), %rax
	ret

Issues:
1. First movq should be movl; saves a byte.
2. Both andq's should be andl; saves another two bytes.  I think this was
   implemented at one point, but subsequently regressed.
3. shrq should be shrl; saves another byte.
4. The first andq can be completely eliminated by using a slightly more
   expensive addressing mode.

//===---------------------------------------------------------------------===//

Consider the following (contrived testcase, but contains common factors):

#include <stdarg.h>
int test(int x, ...) {
  int sum, i;
  va_list l;
  va_start(l, x);
  for (i = 0; i < x; i++)
    sum += va_arg(l, int);
  va_end(l);
  return sum;
}

Testcase given in C because fixing it will likely involve changing the IR
generated for it.  The primary issue with the result is that it doesn't do any
of the optimizations which are possible if we know the address of a va_list
in the current function is never taken:
1. We shouldn't spill the XMM registers because we only call va_arg with "int".
2. It would be nice if we could scalarrepl the va_list.
3. Probably overkill, but it'd be cool if we could peel off the first five
iterations of the loop.

Other optimizations involving functions which use va_arg on floats which don't
have the address of a va_list taken:
1. Conversely to the above, we shouldn't spill general registers if we only
   call va_arg on "double".
2. If we know nothing more than 64 bits wide is read from the XMM registers,
   we can change the spilling code to reduce the amount of stack used by half.

//===---------------------------------------------------------------------===//


README.txt

//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

We should add support for the "movbe" instruction, which does a byte-swapping
copy (3-addr bswap + memory support?).  This is available on Atom processors.

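For example (a sketch of my own of the pattern such support would match, using
the gcc/clang bswap builtin): a byte-swapping load that could become a single
movbe.

#include <stdint.h>

uint32_t load_be32(const uint32_t *p) {
  return __builtin_bswap32(*p);   /* load + bswap; movbe could do both */
}
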
//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer.  What about overflow
though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224

//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl    %eax, %eax
        xorl    %edx, %edx
        testb   $32, %cl
        sete    %al
        setne   %dl
        sall    %cl, %eax
        sall    %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better.  It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
        movl %ecx, %eax
        shrl $5, %eax
        movl %eax, %edx
        xorl $1, %edx
        sall %cl, %eax
        sall %cl, %edx

64-bit shifts (in general) expand to really bad code.  Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure.  E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it, though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing. Need to investigate.

//===---------------------------------------------------------------------===//

This:

void foo(void);
void bar(int x, int *P) {
  x >>= 2;
  if (x)
    foo();
  *P = x;
}

compiles into:

	movq	%rsi, %rbx
	movl	%edi, %r14d
	sarl	$2, %r14d
	testl	%r14d, %r14d
	je	LBB0_2

Instead of doing an explicit test, we can use the flags off the sar.  This
occurs in a bigger testcase like this, which is pretty common:

#include <vector>
int test1(std::vector<int> &X) {
  int Sum = 0;
  for (long i = 0, e = X.size(); i != e; ++i)
    X[i] = 0;
  return Sum;
}

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor.  They are slower on the P4 due to only updating some processor
flags.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare.  The
pattern is written as (cmp reg, (load p)).  Because the compare isn't
commutative, it is not matched with the load on both sides.  The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax
        ret

On some processors (which ones?), it is more efficient to do this:

_test:
        movl 8(%esp), %ebx
        xor  %eax, %eax
        cmpl %ebx, 4(%esp)
        setl %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when code size is important.  E.g., for:

void setbit(int *target, int bit) {
    *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
    *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

	movl $16843009, 4(%edx)
	movl $16843009, (%edx)
	movw $257, 8(%edx)

it might be better to generate:

	movl $16843009, %eax
	movl %eax, 4(%edx)
	movl %eax, (%edx)
	movw %ax, 8(%edx)

when we can spare a register. It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
get this:

define i32 @test1(i32 %X) {
    %Y = sdiv i32 %X, 8
    ret i32 %Y
}

_test1:
        movl 4(%esp), %eax
        movl %eax, %ecx
        sarl $31, %ecx
        shrl $29, %ecx
        addl %ecx, %eax
        sarl $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl    4(%esp), %eax
        cmpl    $-1, %eax
        leal    7(%eax), %ecx
        cmovle  %ecx, %eax
        sarl    $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl. We should leave these as libcalls for everything over a much lower
threshold, since libc is hand-tuned for medium and large memory ops (avoiding
RFO for large stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)

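A sketch of my own of what "reasonable" could mean here: the product of the
two copysigns is +/-1.0, so the expression just flips the sign of x when the
sign bits of y and z differ, which is a single xor on the integer side.

#include <stdint.h>
#include <string.h>

double f(double x, double y, double z) {
  uint64_t xi, yi, zi;
  memcpy(&xi, &x, 8);
  memcpy(&yi, &y, 8);
  memcpy(&zi, &z, 8);
  xi ^= (yi ^ zi) & 0x8000000000000000ULL;   /* flip x's sign if y, z differ */
  memcpy(&x, &xi, 8);
  return x;
}
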
//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

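A bit-level sketch of the idea (mine, not from the note): only the sign bit of
*y is needed, so *y can be fetched with an integer load and never touch an
FP/SSE register.

#include <stdint.h>
#include <string.h>

double copysign_mem(double x, const double *y) {
  uint64_t xi, yi;
  memcpy(&xi, &x, 8);
  memcpy(&yi, y, 8);                              /* integer load of *y */
  xi = (xi & ~(1ULL << 63)) | (yi & (1ULL << 63));
  memcpy(&x, &xi, 8);
  return x;
}
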
232//===---------------------------------------------------------------------===//
233
234The following tests perform worse with LSR:
235
236lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
237
238//===---------------------------------------------------------------------===//
239
240Adding to the list of cmp / test poor codegen issues:
241
242int test(__m128 *A, __m128 *B) {
243  if (_mm_comige_ss(*A, *B))
244    return 3;
245  else
246    return 4;
247}
248
249_test:
250	movl 8(%esp), %eax
251	movaps (%eax), %xmm0
252	movl 4(%esp), %eax
253	movaps (%eax), %xmm1
254	comiss %xmm0, %xmm1
255	setae %al
256	movzbl %al, %ecx
257	movl $3, %eax
258	movl $4, %edx
259	cmpl $0, %ecx
260	cmove %edx, %eax
261	ret
262
263Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
264are a number of issues. 1) We are introducing a setcc between the result of the
265intrisic call and select. 2) The intrinsic is expected to produce a i32 value
266so a any extend (which becomes a zero extend) is added.
267
268We probably need some kind of target DAG combine hook to fix this.
269
270//===---------------------------------------------------------------------===//
271
272We generate significantly worse code for this than GCC:
273http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
274http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
275
276There is also one case we do worse on PPC.
277
278//===---------------------------------------------------------------------===//
279
280For this:
281
282int test(int a)
283{
284  return a * 3;
285}
286
287We currently emits
288	imull $3, 4(%esp), %eax
289
290Perhaps this is what we really should generate is? Is imull three or four
291cycles? Note: ICC generates this:
292	movl	4(%esp), %eax
293	leal	(%eax,%eax,2), %eax
294
295The current instruction priority is based on pattern complexity. The former is
296more "complex" because it folds a load so the latter will not be emitted.
297
298Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
299should always try to match LEA first since the LEA matching code does some
300estimate to determine whether the match is profitable.
301
302However, if we care more about code size, then imull is better. It's two bytes
303shorter than movl + leal.
304
305On a Pentium M, both variants have the same characteristics with regard
306to throughput; however, the multiplication has a latency of four cycles, as
307opposed to two cycles for the movl+lea variant.
308
309//===---------------------------------------------------------------------===//
310
311__builtin_ffs codegen is messy.
312
313int ffs_(unsigned X) { return __builtin_ffs(X); }
314
315llvm produces:
316ffs_:
317        movl    4(%esp), %ecx
318        bsfl    %ecx, %eax
319        movl    $32, %edx
320        cmove   %edx, %eax
321        incl    %eax
322        xorl    %edx, %edx
323        testl   %ecx, %ecx
324        cmove   %edx, %eax
325        ret
326
327vs gcc:
328
329_ffs_:
330        movl    $-1, %edx
331        bsfl    4(%esp), %eax
332        cmove   %edx, %eax
333        addl    $1, %eax
334        ret
335
336Another example of __builtin_ffs (use predsimplify to eliminate a select):
337
338int foo (unsigned long j) {
339  if (j)
340    return __builtin_ffs (j) - 1;
341  else
342    return 0;
343}
344
345//===---------------------------------------------------------------------===//
346
347It appears gcc place string data with linkonce linkage in
348.section __TEXT,__const_coal,coalesced instead of
349.section __DATA,__const_coal,coalesced.
350Take a look at darwin.h, there are other Darwin assembler directives that we
351do not make use of.
352
353//===---------------------------------------------------------------------===//
354
355define i32 @foo(i32* %a, i32 %t) {
356entry:
357	br label %cond_true
358
359cond_true:		; preds = %cond_true, %entry
360	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
361	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
362	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
363	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
364	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
365	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
366	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
367	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
368	br i1 %tmp, label %bb12, label %cond_true
369
370bb12:		; preds = %cond_true
371	ret i32 %tmp7
372}
373is pessimized by -loop-reduce and -indvars
374
375//===---------------------------------------------------------------------===//
376
377u32 to float conversion improvement:
378
379float uint32_2_float( unsigned u ) {
380  float fl = (int) (u & 0xffff);
381  float fh = (int) (u >> 16);
382  fh *= 0x1.0p16f;
383  return fh + fl;
384}
385
38600000000        subl    $0x04,%esp
38700000003        movl    0x08(%esp,1),%eax
38800000007        movl    %eax,%ecx
38900000009        shrl    $0x10,%ecx
3900000000c        cvtsi2ss        %ecx,%xmm0
39100000010        andl    $0x0000ffff,%eax
39200000015        cvtsi2ss        %eax,%xmm1
39300000019        mulss   0x00000078,%xmm0
39400000021        addss   %xmm1,%xmm0
39500000025        movss   %xmm0,(%esp,1)
3960000002a        flds    (%esp,1)
3970000002d        addl    $0x04,%esp
39800000030        ret
399
400//===---------------------------------------------------------------------===//
401
When using the fastcc ABI, align the stack slot of an argument of type double
on an 8-byte boundary to improve performance.
404
405//===---------------------------------------------------------------------===//
406
407GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
408simplifications for integer "x cmp y ? a : b".
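
One example of the kind of simplification meant here (a hand-written
illustration, not something lifted from i386.c):

int sign_mask(int x) {
  /* "x < 0 ? -1 : 0" can be lowered to a single arithmetic shift,
     sarl $31, %eax, with no compare and no branch. */
  return x < 0 ? -1 : 0;
}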
409
410//===---------------------------------------------------------------------===//
411
412Consider the expansion of:
413
414define i32 @test3(i32 %X) {
415        %tmp1 = urem i32 %X, 255
416        ret i32 %tmp1
417}
418
419Currently it compiles to:
420
421...
422        movl $2155905153, %ecx
423        movl 8(%esp), %esi
424        movl %esi, %eax
425        mull %ecx
426...
427
428This could be "reassociated" into:
429
430        movl $2155905153, %eax
431        movl 8(%esp), %ecx
432        mull %ecx
433
434to avoid the copy.  In fact, the existing two-address stuff would do this
435except that mul isn't a commutative 2-addr instruction.  I guess this has
436to be done at isel time based on the #uses to mul?
437
438//===---------------------------------------------------------------------===//
439
Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example from 256.bzip2:
443
444In the new trace, the hot loop has an instruction which crosses a cacheline
445boundary.  In addition to potential cache misses, this can't help decoding as I
446imagine there has to be some kind of complicated decoder reset and realignment
447to grab the bytes from the next cacheline.
448
449532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
450942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
451937  937 0x3d0a incl     %esi
4523    3   0x3d0b cmpb     %bl, %dl
45327   27  0x3d0d jnz      0x000062db <main+11707>
454
455//===---------------------------------------------------------------------===//
456
457In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
458
459//===---------------------------------------------------------------------===//
460
461This could be a single 16-bit load.
462
463int f(char *p) {
464    if ((p[0] == 1) & (p[1] == 2)) return 1;
465    return 0;
466}
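
A sketch of what the combined access would look like at the source level
(assumes a little-endian target, so the 2 lands in the high byte; names are
made up for illustration):

#include <string.h>
#include <stdint.h>

int f_wide(char *p) {
    uint16_t v;
    memcpy(&v, p, 2);       /* becomes a single 16-bit load */
    return v == 0x0201;     /* 0x01 from p[0], 0x02 from p[1] */
}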
467
468//===---------------------------------------------------------------------===//
469
470We should inline lrintf and probably other libc functions.
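
For instance (a minimal sketch; lrintf is the C99 libm function):

#include <math.h>

long round_to_long(float x) {
  /* With SSE this could lower to a single cvtss2si, which uses the same
     MXCSR rounding mode lrintf is defined to honor, instead of a libm call. */
  return lrintf(x);
}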
471
472//===---------------------------------------------------------------------===//
473
474Use the FLAGS values from arithmetic instructions more.  For example, compile:
475
476int add_zf(int *x, int y, int a, int b) {
477     if ((*x += y) == 0)
478          return a;
479     else
480          return b;
481}
482
483to:
484       addl    %esi, (%rdi)
485       movl    %edx, %eax
486       cmovne  %ecx, %eax
487       ret
488instead of:
489
490_add_zf:
491        addl (%rdi), %esi
492        movl %esi, (%rdi)
493        testl %esi, %esi
494        cmove %edx, %ecx
495        movl %ecx, %eax
496        ret
497
498As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
499without a test instruction.
500
501//===---------------------------------------------------------------------===//
502
503These two functions have identical effects:
504
505unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
506unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
507
508We currently compile them to:
509
510_f:
511        movl 4(%esp), %eax
512        movl %eax, %ecx
513        incl %ecx
514        movl 8(%esp), %edx
515        cmpl %edx, %ecx
516        jne LBB1_2      #UnifiedReturnBlock
517LBB1_1: #cond_true
518        addl $2, %eax
519        ret
520LBB1_2: #UnifiedReturnBlock
521        movl %ecx, %eax
522        ret
523_f2:
524        movl 4(%esp), %eax
525        movl %eax, %ecx
526        incl %ecx
527        cmpl 8(%esp), %ecx
528        sete %cl
529        movzbl %cl, %ecx
530        leal 1(%ecx,%eax), %eax
531        ret
532
533both of which are inferior to GCC's:
534
535_f:
536        movl    4(%esp), %edx
537        leal    1(%edx), %eax
538        addl    $2, %edx
539        cmpl    8(%esp), %eax
540        cmove   %edx, %eax
541        ret
542_f2:
543        movl    4(%esp), %eax
544        addl    $1, %eax
545        xorl    %edx, %edx
546        cmpl    8(%esp), %eax
547        sete    %dl
548        addl    %edx, %eax
549        ret
550
551//===---------------------------------------------------------------------===//
552
553This code:
554
555void test(int X) {
556  if (X) abort();
557}
558
559is currently compiled to:
560
561_test:
562        subl $12, %esp
563        cmpl $0, 16(%esp)
564        jne LBB1_1
565        addl $12, %esp
566        ret
567LBB1_1:
568        call L_abort$stub
569
570It would be better to produce:
571
572_test:
573        subl $12, %esp
574        cmpl $0, 16(%esp)
575        jne L_abort$stub
576        addl $12, %esp
577        ret
578
579This can be applied to any no-return function call that takes no arguments etc.
580Alternatively, the stack save/restore logic could be shrink-wrapped, producing
581something like this:
582
583_test:
584        cmpl $0, 4(%esp)
585        jne LBB1_1
586        ret
587LBB1_1:
588        subl $12, %esp
589        call L_abort$stub
590
591Both are useful in different situations.  Finally, it could be shrink-wrapped
592and tail called, like this:
593
594_test:
595        cmpl $0, 4(%esp)
596        jne LBB1_1
597        ret
598LBB1_1:
599        pop %eax   # realign stack.
600        call L_abort$stub
601
602Though this probably isn't worth it.
603
604//===---------------------------------------------------------------------===//
605
606Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
607a neg instead of a sub instruction.  Consider:
608
609int test(char X) { return 7-X; }
610
611we currently produce:
612_test:
613        movl $7, %eax
614        movsbl 4(%esp), %ecx
615        subl %ecx, %eax
616        ret
617
618We would use one fewer register if codegen'd as:
619
620        movsbl 4(%esp), %eax
621	neg %eax
622        add $7, %eax
623        ret
624
625Note that this isn't beneficial if the load can be folded into the sub.  In
626this case, we want a sub:
627
628int test(int X) { return 7-X; }
629_test:
630        movl $7, %eax
631        subl 4(%esp), %eax
632        ret
633
634//===---------------------------------------------------------------------===//
635
636Leaf functions that require one 4-byte spill slot have a prolog like this:
637
638_foo:
639        pushl   %esi
640        subl    $4, %esp
641...
642and an epilog like this:
643        addl    $4, %esp
644        popl    %esi
645        ret
646
647It would be smaller, and potentially faster, to push eax on entry and to
648pop into a dummy register instead of using addl/subl of esp.  Just don't pop
649into any return registers :)
650
651//===---------------------------------------------------------------------===//
652
653The X86 backend should fold (branch (or (setcc, setcc))) into multiple
654branches.  We generate really poor code for:
655
656double testf(double a) {
657       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
658}
659
660For example, the entry BB is:
661
662_testf:
663        subl    $20, %esp
664        pxor    %xmm0, %xmm0
665        movsd   24(%esp), %xmm1
666        ucomisd %xmm0, %xmm1
667        setnp   %al
668        sete    %cl
669        testb   %cl, %al
670        jne     LBB1_5  # UnifiedReturnBlock
671LBB1_1: # cond_true
672
673
674it would be better to replace the last four instructions with:
675
676	jp LBB1_1
677	je LBB1_5
678LBB1_1:
679
680We also codegen the inner ?: into a diamond:
681
682       cvtss2sd        LCPI1_0(%rip), %xmm2
683        cvtss2sd        LCPI1_1(%rip), %xmm3
684        ucomisd %xmm1, %xmm0
685        ja      LBB1_3  # cond_true
686LBB1_2: # cond_true
687        movapd  %xmm3, %xmm2
688LBB1_3: # cond_true
689        movapd  %xmm2, %xmm0
690        ret
691
692We should sink the load into xmm3 into the LBB1_2 block.  This should
693be pretty easy, and will nuke all the copies.
694
695//===---------------------------------------------------------------------===//
696
697This:
698        #include <algorithm>
699        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
700        { return std::make_pair(a + b, a + b < a); }
701        bool no_overflow(unsigned a, unsigned b)
702        { return !full_add(a, b).second; }
703
704Should compile to:
705	addl	%esi, %edi
706	setae	%al
707	movzbl	%al, %eax
708	ret
709
710on x86-64, instead of the rather stupid-looking:
711	addl	%esi, %edi
712	setb	%al
713	xorb	$1, %al
714	movzbl	%al, %eax
715	ret
716
717
718//===---------------------------------------------------------------------===//
719
720The following code:
721
722bb114.preheader:		; preds = %cond_next94
723	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
724	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
725	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
726	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
727	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
728	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
729	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
730	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
731	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
732	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
733	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
734	br label %bb114
735
736produces:
737
738LBB3_5:	# bb114.preheader
739	movswl	-68(%ebp), %eax
740	movl	$32, %ecx
741	movl	%ecx, -80(%ebp)
742	subl	%eax, -80(%ebp)
743	movswl	-52(%ebp), %eax
744	movl	%ecx, -84(%ebp)
745	subl	%eax, -84(%ebp)
746	movswl	-70(%ebp), %eax
747	movl	%ecx, -88(%ebp)
748	subl	%eax, -88(%ebp)
749	movswl	-50(%ebp), %eax
750	subl	%eax, %ecx
751	movl	%ecx, -76(%ebp)
752	movswl	-42(%ebp), %eax
753	movl	%eax, -92(%ebp)
754	movswl	-66(%ebp), %eax
755	movl	%eax, -96(%ebp)
756	movw	$0, -98(%ebp)
757
758This appears to be bad because the RA is not folding the store to the stack
759slot into the movl.  The above instructions could be:
760	movl    $32, -80(%ebp)
761...
762	movl    $32, -84(%ebp)
763...
764This seems like a cross between remat and spill folding.
765
766This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
767change, so we could simply subtract %eax from %ecx first and then use %ecx (or
768vice-versa).
769
770//===---------------------------------------------------------------------===//
771
772This code:
773
774	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
775	br i1 %tmp659, label %cond_true662, label %cond_next715
776
777produces this:
778
779	testw	%cx, %cx
780	movswl	%cx, %esi
781	jns	LBB4_109	# cond_next715
782
783Shark tells us that using %cx in the testw instruction is sub-optimal. It
784suggests using the 32-bit register (which is what ICC uses).
785
786//===---------------------------------------------------------------------===//
787
788We compile this:
789
790void compare (long long foo) {
791  if (foo < 4294967297LL)
792    abort();
793}
794
795to:
796
797compare:
798        subl    $4, %esp
799        cmpl    $0, 8(%esp)
800        setne   %al
801        movzbw  %al, %ax
802        cmpl    $1, 12(%esp)
803        setg    %cl
804        movzbw  %cl, %cx
805        cmove   %ax, %cx
806        testb   $1, %cl
807        jne     .LBB1_2 # UnifiedReturnBlock
808.LBB1_1:        # ifthen
809        call    abort
810.LBB1_2:        # UnifiedReturnBlock
811        addl    $4, %esp
812        ret
813
814(also really horrible code on ppc).  This is due to the expand code for 64-bit
815compares.  GCC produces multiple branches, which is much nicer:
816
817compare:
818        subl    $12, %esp
819        movl    20(%esp), %edx
820        movl    16(%esp), %eax
821        decl    %edx
822        jle     .L7
823.L5:
824        addl    $12, %esp
825        ret
826        .p2align 4,,7
827.L7:
828        jl      .L4
829        cmpl    $0, %eax
830        .p2align 4,,8
831        ja      .L5
832.L4:
833        .p2align 4,,9
834        call    abort
835
836//===---------------------------------------------------------------------===//
837
Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail-call-optimized calls) that source from the caller's arguments
or that source from a virtual register (also possibly sourcing from the
caller's arguments).
This is done to prevent overwriting parameters (see the example
below) that might be used later.
845
846example:
847
848int callee(int32, int64);
849int caller(int32 arg1, int32 arg2) {
850  int64 local = arg2 * 2;
851  return callee(arg2, (int64)local);
852}
853
854[arg1]          [!arg2 no longer valid since we moved local onto it]
855[arg2]      ->  [(int64)
856[RETADDR]        local  ]
857
858Moving arg1 onto the stack slot of callee function would overwrite
859arg2 of the caller.
860
861Possible optimizations:
862
863
 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee, and only
   push those onto the top of the stack (see the sketch after the
   examples below).
867
868   int callee (int32 arg1, int32 arg2);
869   int caller (int32 arg1, int32 arg2) {
870       return callee(arg1,arg2);
871   }
872
873   Here we don't need to write any variables to the top of the stack
874   since they don't overwrite each other.
875
876   int callee (int32 arg1, int32 arg2);
877   int caller (int32 arg1, int32 arg2) {
878       return callee(arg2,arg1);
879   }
880
881   Here we need to push the arguments because they overwrite each
882   other.
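
A rough sketch of the overlap test described above, using plain byte offsets
rather than the backend's real data structures (the struct and function names
here are made up for illustration):

#include <stdbool.h>

struct Slot { int offset, size; };   /* byte range within the argument area */

static bool overlaps(struct Slot a, struct Slot b) {
  return a.offset < b.offset + b.size && b.offset < a.offset + a.size;
}

/* An outgoing argument only needs to be staged on top of the stack if the
   slot it is written to overlaps an incoming-argument slot that is still
   read after the store; otherwise it can be stored to its final slot
   directly. */
static bool needs_temporary(struct Slot dst,
                            const struct Slot *live_incoming, int n) {
  for (int i = 0; i < n; ++i)
    if (overlaps(dst, live_incoming[i]))
      return true;
  return false;
}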
883
884//===---------------------------------------------------------------------===//
885
886main ()
887{
888  int i = 0;
889  unsigned long int z = 0;
890
891  do {
892    z -= 0x00004000;
893    i++;
894    if (i > 0x00040000)
895      abort ();
896  } while (z > 0);
897  exit (0);
898}
899
900gcc compiles this to:
901
902_main:
903	subl	$28, %esp
904	xorl	%eax, %eax
905	jmp	L2
906L3:
907	cmpl	$262144, %eax
908	je	L10
909L2:
910	addl	$1, %eax
911	cmpl	$262145, %eax
912	jne	L3
913	call	L_abort$stub
914L10:
915	movl	$0, (%esp)
916	call	L_exit$stub
917
918llvm:
919
920_main:
921	subl	$12, %esp
922	movl	$1, %eax
923	movl	$16384, %ecx
924LBB1_1:	# bb
925	cmpl	$262145, %eax
926	jge	LBB1_4	# cond_true
927LBB1_2:	# cond_next
928	incl	%eax
929	addl	$4294950912, %ecx
930	cmpl	$16384, %ecx
931	jne	LBB1_1	# bb
932LBB1_3:	# bb11
933	xorl	%eax, %eax
934	addl	$12, %esp
935	ret
936LBB1_4:	# cond_true
937	call	L_abort$stub
938
9391. LSR should rewrite the first cmp with induction variable %ecx.
9402. DAG combiner should fold
941        leal    1(%eax), %edx
942        cmpl    $262145, %edx
943   =>
944        cmpl    $262144, %eax
945
946//===---------------------------------------------------------------------===//
947
948define i64 @test(double %X) {
949	%Y = fptosi double %X to i64
950	ret i64 %Y
951}
952
953compiles to:
954
955_test:
956	subl	$20, %esp
957	movsd	24(%esp), %xmm0
958	movsd	%xmm0, 8(%esp)
959	fldl	8(%esp)
960	fisttpll	(%esp)
961	movl	4(%esp), %edx
962	movl	(%esp), %eax
963	addl	$20, %esp
964	#FP_REG_KILL
965	ret
966
967This should just fldl directly from the input stack slot.
968
969//===---------------------------------------------------------------------===//
970
971This code:
972int foo (int x) { return (x & 65535) | 255; }
973
974Should compile into:
975
976_foo:
977        movzwl  4(%esp), %eax
978        orl     $255, %eax
979        ret
980
981instead of:
982_foo:
983	movl	$65280, %eax
984	andl	4(%esp), %eax
985	orl	$255, %eax
986	ret
987
988//===---------------------------------------------------------------------===//
989
990We're codegen'ing multiply of long longs inefficiently:
991
992unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
993  return arg1 *  arg2;
994}
995
996We compile to (fomit-frame-pointer):
997
998_LLM:
999	pushl	%esi
1000	movl	8(%esp), %ecx
1001	movl	16(%esp), %esi
1002	movl	%esi, %eax
1003	mull	%ecx
1004	imull	12(%esp), %esi
1005	addl	%edx, %esi
1006	imull	20(%esp), %ecx
1007	movl	%esi, %edx
1008	addl	%ecx, %edx
1009	popl	%esi
1010	ret
1011
1012This looks like a scheduling deficiency and lack of remat of the load from
1013the argument area.  ICC apparently produces:
1014
1015        movl      8(%esp), %ecx
1016        imull     12(%esp), %ecx
1017        movl      16(%esp), %eax
1018        imull     4(%esp), %eax
1019        addl      %eax, %ecx
1020        movl      4(%esp), %eax
1021        mull      12(%esp)
1022        addl      %ecx, %edx
1023        ret
1024
1025Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1026http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1027
1028//===---------------------------------------------------------------------===//
1029
1030We can fold a store into "zeroing a reg".  Instead of:
1031
1032xorl    %eax, %eax
1033movl    %eax, 124(%esp)
1034
1035we should get:
1036
1037movl    $0, 124(%esp)
1038
1039if the flags of the xor are dead.
1040
1041Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1042be folded into: shl [mem], 1
1043
1044//===---------------------------------------------------------------------===//
1045
1046In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1047or and instruction, for example:
1048
1049	xorpd	LCPI1_0, %xmm2
1050
1051However, if xmm2 gets spilled, we end up with really ugly code like this:
1052
1053	movsd	(%esp), %xmm0
1054	xorpd	LCPI1_0, %xmm0
1055	movsd	%xmm0, (%esp)
1056
1057Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1058the neg/abs instruction, turning it into an *integer* operation, like this:
1059
1060	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1061
1062you could also use xorb, but xorl is less likely to lead to a partial register
1063stall.  Here is a contrived testcase:
1064
1065double a, b, c;
1066void test(double *P) {
1067  double X = *P;
1068  a = X;
1069  bar();
1070  X = -X;
1071  b = X;
1072  bar();
1073  c = X;
1074}
1075
1076//===---------------------------------------------------------------------===//
1077
The code generated on x86 for checking for signed overflow on a multiply in the
obvious way is much longer than it needs to be.
1080
1081int x(int a, int b) {
1082  long long prod = (long long)a*b;
1083  return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1084}
1085
1086See PR2053 for more details.
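
The check is just asking whether the 64-bit product fits in 32 bits, so at the
source level it is equivalent to (illustration only, assuming the usual
truncating conversion to int):

int x_short(int a, int b) {
  long long prod = (long long)a * b;
  /* Overflow occurred iff truncating to 32 bits changes the value. */
  return (int)prod != prod;
}

On x86 the two-operand imul already sets OF when the signed result does not
fit, so ideally the whole function shrinks to roughly imull + seto + movzbl.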
1087
1088//===---------------------------------------------------------------------===//
1089
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
1091more aggressively; it should cost the same as a move+shift on any modern
1092processor, but it's a lot shorter. Downside is that it puts more
1093pressure on register allocation because it has fixed operands.
1094
1095Example:
1096int abs(int x) {return x < 0 ? -x : x;}
1097
1098gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1099abs:
1100        movl    4(%esp), %eax
1101        cltd
1102        xorl    %edx, %eax
1103        subl    %edx, %eax
1104        ret
1105
1106//===---------------------------------------------------------------------===//
1107
1108Take the following code (from
1109http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1110
1111extern unsigned char first_one[65536];
1112int FirstOnet(unsigned long long arg1)
1113{
1114  if (arg1 >> 48)
1115    return (first_one[arg1 >> 48]);
1116  return 0;
1117}
1118
1119
1120The following code is currently generated:
1121FirstOnet:
1122        movl    8(%esp), %eax
1123        cmpl    $65536, %eax
1124        movl    4(%esp), %ecx
1125        jb      .LBB1_2 # UnifiedReturnBlock
1126.LBB1_1:        # ifthen
1127        shrl    $16, %eax
1128        movzbl  first_one(%eax), %eax
1129        ret
1130.LBB1_2:        # UnifiedReturnBlock
1131        xorl    %eax, %eax
1132        ret
1133
1134We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1135lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
1136
1137//===---------------------------------------------------------------------===//
1138
1139We compile this function:
1140
1141define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1142entry:
1143	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
1144	br i1 %tmp2, label %bb7, label %bb
1145
1146bb:		; preds = %entry
1147	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
1148	ret i32 %tmp6
1149
1150bb7:		; preds = %entry
1151	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
1152	ret i32 %tmp10
1153}
1154
1155to:
1156
1157foo:                                    # @foo
1158# BB#0:                                 # %entry
1159	movl	4(%esp), %ecx
1160	cmpb	$0, 16(%esp)
1161	je	.LBB0_2
1162# BB#1:                                 # %bb
1163	movl	8(%esp), %eax
1164	addl	%ecx, %eax
1165	ret
1166.LBB0_2:                                # %bb7
1167	movl	12(%esp), %edx
1168	movl	%ecx, %eax
1169	subl	%edx, %eax
1170	ret
1171
1172There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1173couple more movls by putting 4(%esp) into %eax instead of %ecx.
1174
1175//===---------------------------------------------------------------------===//
1176
1177See rdar://4653682.
1178
1179From flops:
1180
1181LBB1_15:        # bb310
1182        cvtss2sd        LCPI1_0, %xmm1
1183        addsd   %xmm1, %xmm0
1184        movsd   176(%esp), %xmm2
1185        mulsd   %xmm0, %xmm2
1186        movapd  %xmm2, %xmm3
1187        mulsd   %xmm3, %xmm3
1188        movapd  %xmm3, %xmm4
1189        mulsd   LCPI1_23, %xmm4
1190        addsd   LCPI1_24, %xmm4
1191        mulsd   %xmm3, %xmm4
1192        addsd   LCPI1_25, %xmm4
1193        mulsd   %xmm3, %xmm4
1194        addsd   LCPI1_26, %xmm4
1195        mulsd   %xmm3, %xmm4
1196        addsd   LCPI1_27, %xmm4
1197        mulsd   %xmm3, %xmm4
1198        addsd   LCPI1_28, %xmm4
1199        mulsd   %xmm3, %xmm4
1200        addsd   %xmm1, %xmm4
1201        mulsd   %xmm2, %xmm4
1202        movsd   152(%esp), %xmm1
1203        addsd   %xmm4, %xmm1
1204        movsd   %xmm1, 152(%esp)
1205        incl    %eax
1206        cmpl    %eax, %esi
1207        jge     LBB1_15 # bb310
1208LBB1_16:        # bb358.loopexit
1209        movsd   152(%esp), %xmm0
1210        addsd   %xmm0, %xmm0
1211        addsd   LCPI1_22, %xmm0
1212        movsd   %xmm0, 152(%esp)
1213
Rather than spilling the result of the last addsd in the loop, we should
insert a copy to split the interval (one for the duration of the loop, one
1216extending to the fall through). The register pressure in the loop isn't high
1217enough to warrant the spill.
1218
1219Also check why xmm7 is not used at all in the function.
1220
1221//===---------------------------------------------------------------------===//
1222
1223Take the following:
1224
1225target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1226target triple = "i386-apple-darwin8"
1227@in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
1228define fastcc void @abort_gzip() noreturn nounwind  {
1229entry:
1230	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
1231	br i1 %tmp.b.i, label %bb.i, label %bb4.i
1232bb.i:		; preds = %entry
1233	tail call void @exit( i32 1 ) noreturn nounwind
1234	unreachable
1235bb4.i:		; preds = %entry
1236	store i1 true, i1* @in_exit.4870.b
1237	tail call void @exit( i32 1 ) noreturn nounwind
1238	unreachable
1239}
1240declare void @exit(i32) noreturn nounwind
1241
1242This compiles into:
1243_abort_gzip:                            ## @abort_gzip
1244## BB#0:                                ## %entry
1245	subl	$12, %esp
1246	movb	_in_exit.4870.b, %al
1247	cmpb	$1, %al
1248	jne	LBB0_2
1249
1250We somehow miss folding the movb into the cmpb.
1251
1252//===---------------------------------------------------------------------===//
1253
1254We compile:
1255
1256int test(int x, int y) {
1257  return x-y-1;
1258}
1259
1260into (-m64):
1261
1262_test:
1263	decl	%edi
1264	movl	%edi, %eax
1265	subl	%esi, %eax
1266	ret
1267
1268it would be better to codegen as: x+~y  (notl+addl)
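
The identity behind that suggestion (two's complement: ~y == -y - 1):

int test_alt(int x, int y) {
  return x + ~y;   /* same value as x - y - 1 */
}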
1269
1270//===---------------------------------------------------------------------===//
1271
1272This code:
1273
1274int foo(const char *str,...)
1275{
1276 __builtin_va_list a; int x;
1277 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1278 return x;
1279}
1280
1281gets compiled into this on x86-64:
1282	subq    $200, %rsp
1283        movaps  %xmm7, 160(%rsp)
1284        movaps  %xmm6, 144(%rsp)
1285        movaps  %xmm5, 128(%rsp)
1286        movaps  %xmm4, 112(%rsp)
1287        movaps  %xmm3, 96(%rsp)
1288        movaps  %xmm2, 80(%rsp)
1289        movaps  %xmm1, 64(%rsp)
1290        movaps  %xmm0, 48(%rsp)
1291        movq    %r9, 40(%rsp)
1292        movq    %r8, 32(%rsp)
1293        movq    %rcx, 24(%rsp)
1294        movq    %rdx, 16(%rsp)
1295        movq    %rsi, 8(%rsp)
1296        leaq    (%rsp), %rax
1297        movq    %rax, 192(%rsp)
1298        leaq    208(%rsp), %rax
1299        movq    %rax, 184(%rsp)
1300        movl    $48, 180(%rsp)
1301        movl    $8, 176(%rsp)
1302        movl    176(%rsp), %eax
1303        cmpl    $47, %eax
1304        jbe     .LBB1_3 # bb
1305.LBB1_1:        # bb3
1306        movq    184(%rsp), %rcx
1307        leaq    8(%rcx), %rax
1308        movq    %rax, 184(%rsp)
1309.LBB1_2:        # bb4
1310        movl    (%rcx), %eax
1311        addq    $200, %rsp
1312        ret
1313.LBB1_3:        # bb
1314        movl    %eax, %ecx
1315        addl    $8, %eax
1316        addq    192(%rsp), %rcx
1317        movl    %eax, 176(%rsp)
1318        jmp     .LBB1_2 # bb4
1319
1320gcc 4.3 generates:
1321	subq    $96, %rsp
1322.LCFI0:
1323        leaq    104(%rsp), %rax
1324        movq    %rsi, -80(%rsp)
1325        movl    $8, -120(%rsp)
1326        movq    %rax, -112(%rsp)
1327        leaq    -88(%rsp), %rax
1328        movq    %rax, -104(%rsp)
1329        movl    $8, %eax
1330        cmpl    $48, %eax
1331        jb      .L6
1332        movq    -112(%rsp), %rdx
1333        movl    (%rdx), %eax
1334        addq    $96, %rsp
1335        ret
1336        .p2align 4,,10
1337        .p2align 3
1338.L6:
1339        mov     %eax, %edx
1340        addq    -104(%rsp), %rdx
1341        addl    $8, %eax
1342        movl    %eax, -120(%rsp)
1343        movl    (%rdx), %eax
1344        addq    $96, %rsp
1345        ret
1346
1347and it gets compiled into this on x86:
1348	pushl   %ebp
1349        movl    %esp, %ebp
1350        subl    $4, %esp
1351        leal    12(%ebp), %eax
1352        movl    %eax, -4(%ebp)
1353        leal    16(%ebp), %eax
1354        movl    %eax, -4(%ebp)
1355        movl    12(%ebp), %eax
1356        addl    $4, %esp
1357        popl    %ebp
1358        ret
1359
1360gcc 4.3 generates:
1361	pushl   %ebp
1362        movl    %esp, %ebp
1363        movl    12(%ebp), %eax
1364        popl    %ebp
1365        ret
1366
1367//===---------------------------------------------------------------------===//
1368
1369Teach tblgen not to check bitconvert source type in some cases. This allows us
1370to consolidate the following patterns in X86InstrMMX.td:
1371
1372def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1373                                                  (iPTR 0))))),
1374          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1375def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1376                                                  (iPTR 0))))),
1377          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1378def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1379                                                  (iPTR 0))))),
1380          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1381
1382There are other cases in various td files.
1383
1384//===---------------------------------------------------------------------===//
1385
1386Take something like the following on x86-32:
1387unsigned a(unsigned long long x, unsigned y) {return x % y;}
1388
1389We currently generate a libcall, but we really shouldn't: the expansion is
1390shorter and likely faster than the libcall.  The expected code is something
1391like the following:
1392
1393	movl	12(%ebp), %eax
1394	movl	16(%ebp), %ecx
1395	xorl	%edx, %edx
1396	divl	%ecx
1397	movl	8(%ebp), %eax
1398	divl	%ecx
1399	movl	%edx, %eax
1400	ret
1401
1402A similar code sequence works for division.
1403
1404//===---------------------------------------------------------------------===//
1405
These should compile to the same code, but the latter codegens to useless
instructions on X86. This may be a trivial dag combine (GCC PR7061):
1408
1409struct s1 { unsigned char a, b; };
1410unsigned long f1(struct s1 x) {
1411    return x.a + x.b;
1412}
1413struct s2 { unsigned a: 8, b: 8; };
1414unsigned long f2(struct s2 x) {
1415    return x.a + x.b;
1416}
1417
1418//===---------------------------------------------------------------------===//
1419
1420We currently compile this:
1421
1422define i32 @func1(i32 %v1, i32 %v2) nounwind {
1423entry:
1424  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1425  %sum = extractvalue {i32, i1} %t, 0
1426  %obit = extractvalue {i32, i1} %t, 1
1427  br i1 %obit, label %overflow, label %normal
1428normal:
1429  ret i32 %sum
1430overflow:
1431  call void @llvm.trap()
1432  unreachable
1433}
1434declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1435declare void @llvm.trap()
1436
1437to:
1438
1439_func1:
1440	movl	4(%esp), %eax
1441	addl	8(%esp), %eax
1442	jo	LBB1_2	## overflow
1443LBB1_1:	## normal
1444	ret
1445LBB1_2:	## overflow
1446	ud2
1447
1448it would be nice to produce "into" someday.
1449
1450//===---------------------------------------------------------------------===//
1451
1452This code:
1453
1454void vec_mpys1(int y[], const int x[], int scaler) {
1455int i;
1456for (i = 0; i < 150; i++)
1457 y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1458}
1459
1460Compiles to this loop with GCC 3.x:
1461
1462.L5:
1463	movl	%ebx, %eax
1464	imull	(%edi,%ecx,4)
1465	shrdl	$31, %edx, %eax
1466	addl	%eax, (%esi,%ecx,4)
1467	incl	%ecx
1468	cmpl	$149, %ecx
1469	jle	.L5
1470
1471llvm-gcc compiles it to the much uglier:
1472
1473LBB1_1:	## bb1
1474	movl	24(%esp), %eax
1475	movl	(%eax,%edi,4), %ebx
1476	movl	%ebx, %ebp
1477	imull	%esi, %ebp
1478	movl	%ebx, %eax
1479	mull	%ecx
1480	addl	%ebp, %edx
1481	sarl	$31, %ebx
1482	imull	%ecx, %ebx
1483	addl	%edx, %ebx
1484	shldl	$1, %eax, %ebx
1485	movl	20(%esp), %eax
1486	addl	%ebx, (%eax,%edi,4)
1487	incl	%edi
1488	cmpl	$150, %edi
1489	jne	LBB1_1	## bb1
1490
1491The issue is that we hoist the cast of "scaler" to long long outside of the
1492loop, the value comes into the loop as two values, and
1493RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1494constructed BUILD_PAIR which represents the cast value.
1495
1496This can be handled by making CodeGenPrepare sink the cast.
1497
1498//===---------------------------------------------------------------------===//
1499
Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions. This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions. It is also currently not done if the
OF or CF flags are needed.
1505
1506The shift operators have the complication that when the shift count is
1507zero, EFLAGS is not set, so they can only subsume a test instruction if
1508the shift count is known to be non-zero. Also, using the EFLAGS value
1509from a shift is apparently very slow on some x86 implementations.
1510
1511In read-modify-write instructions, the root node in the isel match is
1512the store, and isel has no way for the use of the EFLAGS result of the
1513arithmetic to be remapped to the new node.
1514
Add and subtract instructions set OF on signed overflow and CF on unsigned
overflow, while test instructions always clear OF and CF. In order to
1517replace a test with an add or subtract in a situation where OF or CF is
1518needed, codegen must be able to prove that the operation cannot see
1519signed or unsigned overflow, respectively.
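
A small illustration of the shift-count caveat (a made-up example, not from
any testcase):

int shifted_is_zero(unsigned x, unsigned n) {
  /* If n can be zero at runtime, the shift leaves EFLAGS untouched, so the
     compare against zero cannot be folded into the shift; a separate test
     of the result is still required. */
  return (x >> n) == 0;
}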
1520
1521//===---------------------------------------------------------------------===//
1522
1523memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1524define <16 x float> @foo(<16 x float> %A) nounwind {
1525	%tmp = alloca <16 x float>, align 16
1526	%tmp2 = alloca <16 x float>, align 16
1527	store <16 x float> %A, <16 x float>* %tmp
1528	%s = bitcast <16 x float>* %tmp to i8*
1529	%s2 = bitcast <16 x float>* %tmp2 to i8*
1530	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1531	%R = load <16 x float>* %tmp2
1532	ret <16 x float> %R
1533}
1534
1535declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1536
1537which compiles to:
1538
1539_foo:
1540	subl	$140, %esp
1541	movaps	%xmm3, 112(%esp)
1542	movaps	%xmm2, 96(%esp)
1543	movaps	%xmm1, 80(%esp)
1544	movaps	%xmm0, 64(%esp)
1545	movl	60(%esp), %eax
1546	movl	%eax, 124(%esp)
1547	movl	56(%esp), %eax
1548	movl	%eax, 120(%esp)
1549	movl	52(%esp), %eax
1550        <many many more 32-bit copies>
1551      	movaps	(%esp), %xmm0
1552	movaps	16(%esp), %xmm1
1553	movaps	32(%esp), %xmm2
1554	movaps	48(%esp), %xmm3
1555	addl	$140, %esp
1556	ret
1557
1558On Nehalem, it may even be cheaper to just use movups when unaligned than to
1559fall back to lower-granularity chunks.
1560
1561//===---------------------------------------------------------------------===//
1562
1563Implement processor-specific optimizations for parity with GCC on these
1564processors.  GCC does two optimizations:
1565
15661. ix86_pad_returns inserts a noop before ret instructions if immediately
1567   preceded by a conditional branch or is the target of a jump.
15682. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1569   code contains more than 3 branches.
1570
The first one is done for all AMDs, Core 2, and "Generic".
The second one is done for Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
  Core 2, and "Generic".
1574
1575//===---------------------------------------------------------------------===//
1576
1577Testcase:
1578int a(int x) { return (x & 127) > 31; }
1579
1580Current output:
1581	movl	4(%esp), %eax
1582	andl	$127, %eax
1583	cmpl	$31, %eax
1584	seta	%al
1585	movzbl	%al, %eax
1586	ret
1587
1588Ideal output:
1589	xorl	%eax, %eax
1590	testl	$96, 4(%esp)
1591	setne	%al
1592	ret
1593
1594This should definitely be done in instcombine, canonicalizing the range
1595condition into a != condition.  We get this IR:
1596
1597define i32 @a(i32 %x) nounwind readnone {
1598entry:
1599	%0 = and i32 %x, 127		; <i32> [#uses=1]
1600	%1 = icmp ugt i32 %0, 31		; <i1> [#uses=1]
1601	%2 = zext i1 %1 to i32		; <i32> [#uses=1]
1602	ret i32 %2
1603}
1604
1605Instcombine prefers to strength reduce relational comparisons to equality
1606comparisons when possible, this should be another case of that.  This could
1607be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
1608looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
1609be redesigned to use ComputeMaskedBits and friends.
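
In source form, the canonicalization we want is (hand-derived; only bits 5 and
6 of "x & 127" can push the value above 31):

int a_canon(int x) {
  return (x & 96) != 0;   /* 96 == 32 | 64, matching the ideal testl above */
}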
1610
//===---------------------------------------------------------------------===//

1613Testcase:
1614int x(int a) { return (a&0xf0)>>4; }
1615
1616Current output:
1617	movl	4(%esp), %eax
1618	shrl	$4, %eax
1619	andl	$15, %eax
1620	ret
1621
1622Ideal output:
1623	movzbl	4(%esp), %eax
1624	shrl	$4, %eax
1625	ret
1626
1627//===---------------------------------------------------------------------===//
1628
1629Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1630properly.
1631
When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to produce the result in a register to implement
these. Instead, it can use add, sub, inc, dec instructions with the "lock"
prefix.

This is currently implemented using a bit of an instruction selection trick.
The issue is that the target-independent pattern produces one output and a
chain, and we want to map it into one that just outputs a chain. The current
trick is to select
1639it into a MERGE_VALUES with the first definition being an implicit_def. The
1640proper solution is to add new ISD opcodes for the no-output variant. DAG
1641combiner can then transform the node before it gets to target node selection.
1642
Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions. We need a way to
1645add target specific information to target nodes and have this information
1646carried over to machine instructions. Asm printer (or JIT) can use this
1647information to add the "lock" prefix.
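
A trivial instance of the pattern in question (the function name is made up):

void count_event(volatile int *counter) {
  /* The return value is ignored, so this could lower to a single
     "lock incl (%rdi)" instead of code that materializes the new value. */
  __sync_add_and_fetch(counter, 1);
}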
1648
1649//===---------------------------------------------------------------------===//
1650
1651struct B {
1652  unsigned char y0 : 1;
1653};
1654
1655int bar(struct B* a) { return a->y0; }
1656
1657define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1658  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1659  %2 = load i8* %1, align 1
1660  %3 = and i8 %2, 1
1661  %4 = zext i8 %3 to i32
1662  ret i32 %4
1663}
1664
1665bar:                                    # @bar
1666# BB#0:
1667        movb    (%rdi), %al
1668        andb    $1, %al
1669        movzbl  %al, %eax
1670        ret
1671
1672Missed optimization: should be movl+andl.
1673
1674//===---------------------------------------------------------------------===//
1675
The x86-64 ABI says:
1677
1678Booleans, when stored in a memory object, are stored as single byte objects the
1679value of which is always 0 (false) or 1 (true).
1680
1681We are not using this fact:
1682
1683int bar(_Bool *a) { return *a; }
1684
1685define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1686  %1 = load i8* %a, align 1, !tbaa !0
1687  %tmp = and i8 %1, 1
1688  %2 = zext i8 %tmp to i32
1689  ret i32 %2
1690}
1691
1692bar:
1693        movb    (%rdi), %al
1694        andb    $1, %al
1695        movzbl  %al, %eax
1696        ret
1697
1698GCC produces
1699
1700bar:
1701        movzbl  (%rdi), %eax
1702        ret
1703
1704//===---------------------------------------------------------------------===//
1705
1706Consider the following two functions compiled with clang:
1707_Bool foo(int *x) { return !(*x & 4); }
1708unsigned bar(int *x) { return !(*x & 4); }
1709
1710foo:
1711	movl	4(%esp), %eax
1712	testb	$4, (%eax)
1713	sete	%al
1714	movzbl	%al, %eax
1715	ret
1716
1717bar:
1718	movl	4(%esp), %eax
1719	movl	(%eax), %eax
1720	shrl	$2, %eax
1721	andl	$1, %eax
1722	xorl	$1, %eax
1723	ret
1724
The second function generates more code even though the two functions are
functionally identical.
1727
1728//===---------------------------------------------------------------------===//
1729
1730Take the following C code:
1731int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1732
1733We generate the following IR with clang:
1734define i32 @f(i32 %a, i32 %b) nounwind readnone {
1735entry:
1736  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1737  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1738  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1739  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1740  ret i32 %conv5
1741}
1742
1743And the following x86 code:
1744	xorl	%esi, %edi
1745	testb	$-1, %dil
1746	sete	%al
1747	movzbl	%al, %eax
1748	ret
1749
1750A cmpb instead of the xorl+testb would be one instruction shorter.
1751
1752//===---------------------------------------------------------------------===//
1753
1754Given the following C code:
1755int f(int a, int b) { return (signed char)a == (signed char)b; }
1756
1757We generate the following IR with clang:
1758define i32 @f(i32 %a, i32 %b) nounwind readnone {
1759entry:
1760  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1761  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1762  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1763  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1764  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1765  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1766  ret i32 %conv5
1767}
1768
1769And the following x86 code:
1770	movsbl	%sil, %eax
1771	movsbl	%dil, %ecx
1772	cmpl	%eax, %ecx
1773	sete	%al
1774	movzbl	%al, %eax
1775	ret
1776
1777
1778It should be possible to eliminate the sign extensions.
1779
1780//===---------------------------------------------------------------------===//
1781
1782LLVM misses a load+store narrowing opportunity in this code:
1783
1784%struct.bf = type { i64, i16, i16, i32 }
1785
1786@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1787
1788define void @t1() nounwind ssp {
1789entry:
1790  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1791  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1792  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1793  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1794  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1795  store i32 %4, i32* %2, align 1
1796  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1797  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1798  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1799  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1800  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1801  store i32 %9, i32* %7, align 1
1802  ret void
1803}
1804
1805LLVM currently emits this:
1806
1807  movq  bfi(%rip), %rax
1808  andl  $-65537, 8(%rax)
1809  movq  bfi(%rip), %rax
1810  andl  $-131073, 8(%rax)
1811  ret
1812
1813It could narrow the loads and stores to emit this:
1814
1815  movq  bfi(%rip), %rax
1816  andb  $-2, 10(%rax)
1817  movq  bfi(%rip), %rax
1818  andb  $-3, 10(%rax)
1819  ret
1820
1821The trouble is that there is a TokenFactor between the store and the
1822load, making it non-trivial to determine if there's anything between
1823the load and the store which would prohibit narrowing.
1824
1825//===---------------------------------------------------------------------===//
1826
1827This code:
1828void foo(unsigned x) {
1829  if (x == 0) bar();
1830  else if (x == 1) qux();
1831}
1832
1833currently compiles into:
1834_foo:
1835	movl	4(%esp), %eax
1836	cmpl	$1, %eax
1837	je	LBB0_3
1838	testl	%eax, %eax
1839	jne	LBB0_4
1840
1841the testl could be removed:
1842_foo:
1843	movl	4(%esp), %eax
1844	cmpl	$1, %eax
1845	je	LBB0_3
1846	jb	LBB0_4
1847
18480 is the only unsigned number < 1.
1849
1850//===---------------------------------------------------------------------===//
1851
1852This code:
1853
1854%0 = type { i32, i1 }
1855
1856define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1857entry:
1858  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1859  %cmp = extractvalue %0 %uadd, 1
1860  %inc = zext i1 %cmp to i32
1861  %add = add i32 %x, %sum
1862  %z.0 = add i32 %add, %inc
1863  ret i32 %z.0
1864}
1865
1866declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1867
1868compiles to:
1869
1870_add32carry:                            ## @add32carry
1871	addl	%esi, %edi
1872	sbbl	%ecx, %ecx
1873	movl	%edi, %eax
1874	subl	%ecx, %eax
1875	ret
1876
1877But it could be:
1878
1879_add32carry:
1880	leal	(%rsi,%rdi), %eax
1881	cmpl	%esi, %eax
1882	adcl	$0, %eax
1883	ret
1884
1885//===---------------------------------------------------------------------===//
1886
1887The hot loop of 256.bzip2 contains code that looks a bit like this:
1888
1889int foo(char *P, char *Q, int x, int y) {
1890  if (P[0] != Q[0])
1891     return P[0] < Q[0];
1892  if (P[1] != Q[1])
1893     return P[1] < Q[1];
1894  if (P[2] != Q[2])
1895     return P[2] < Q[2];
1896   return P[3] < Q[3];
1897}
1898
1899In the real code, we get a lot more wrong than this.  However, even in this
1900code we generate:
1901
1902_foo:                                   ## @foo
1903## BB#0:                                ## %entry
1904	movb	(%rsi), %al
1905	movb	(%rdi), %cl
1906	cmpb	%al, %cl
1907	je	LBB0_2
1908LBB0_1:                                 ## %if.then
1909	cmpb	%al, %cl
1910	jmp	LBB0_5
1911LBB0_2:                                 ## %if.end
1912	movb	1(%rsi), %al
1913	movb	1(%rdi), %cl
1914	cmpb	%al, %cl
1915	jne	LBB0_1
1916## BB#3:                                ## %if.end38
1917	movb	2(%rsi), %al
1918	movb	2(%rdi), %cl
1919	cmpb	%al, %cl
1920	jne	LBB0_1
1921## BB#4:                                ## %if.end60
1922	movb	3(%rdi), %al
1923	cmpb	3(%rsi), %al
1924LBB0_5:                                 ## %if.end60
1925	setl	%al
1926	movzbl	%al, %eax
1927	ret
1928
1929Note that we generate jumps to LBB0_1 which does a redundant compare.  The
1930redundant compare also forces the register values to be live, which prevents
1931folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
1932
1933_foo:
1934	movzbl	(%rsi), %eax
1935	cmpb	%al, (%rdi)
1936	jne	L10
1937L12:
1938	movzbl	1(%rsi), %eax
1939	cmpb	%al, 1(%rdi)
1940	jne	L10
1941	movzbl	2(%rsi), %eax
1942	cmpb	%al, 2(%rdi)
1943	jne	L10
1944	movzbl	3(%rdi), %eax
1945	cmpb	3(%rsi), %al
1946L10:
1947	setl	%al
1948	movzbl	%al, %eax
1949	ret
1950
1951which is "perfect".
1952
1953//===---------------------------------------------------------------------===//
1954
1955For the branch in the following code:
1956int a();
1957int b(int x, int y) {
1958  if (x & (1<<(y&7)))
1959    return a();
1960  return y;
1961}
1962
1963We currently generate:
1964	movb	%sil, %al
1965	andb	$7, %al
1966	movzbl	%al, %eax
1967	btl	%eax, %edi
1968	jae	.LBB0_2
1969
1970movl+andl would be shorter than the movb+andb+movzbl sequence.
1971
1972//===---------------------------------------------------------------------===//
1973
1974For the following:
1975struct u1 {
1976    float x, y;
1977};
1978float foo(struct u1 u) {
1979    return u.x + u.y;
1980}
1981
1982We currently generate:
1983	movdqa	%xmm0, %xmm1
1984	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
1985	addss	%xmm1, %xmm0
1986	ret
1987
1988We could save an instruction here by commuting the addss.
1989
1990//===---------------------------------------------------------------------===//
1991
1992This (from PR9661):
1993
1994float clamp_float(float a) {
1995        if (a > 1.0f)
1996                return 1.0f;
1997        else if (a < 0.0f)
1998                return 0.0f;
1999        else
2000                return a;
2001}
2002
2003Could compile to:
2004
2005clamp_float:                            # @clamp_float
2006        movss   .LCPI0_0(%rip), %xmm1
2007        minss   %xmm1, %xmm0
2008        pxor    %xmm1, %xmm1
2009        maxss   %xmm1, %xmm0
2010        ret
2011
2012with -ffast-math.
2013
2014//===---------------------------------------------------------------------===//
2015
2016This function (from PR9803):
2017
2018int clamp2(int a) {
2019        if (a > 5)
2020                a = 5;
2021        if (a < 0)
2022                return 0;
2023        return a;
2024}
2025
2026Compiles to:
2027
2028_clamp2:                                ## @clamp2
2029        pushq   %rbp
2030        movq    %rsp, %rbp
2031        cmpl    $5, %edi
2032        movl    $5, %ecx
2033        cmovlel %edi, %ecx
2034        testl   %ecx, %ecx
2035        movl    $0, %eax
2036        cmovnsl %ecx, %eax
2037        popq    %rbp
2038        ret
2039
The move of 0 could be scheduled above the test so that it can be an xor reg,reg.
2041
2042//===---------------------------------------------------------------------===//
2043
2044GCC PR48986.  We currently compile this:
2045
2046void bar(void);
2047void yyy(int* p) {
2048    if (__sync_fetch_and_add(p, -1) == 1)
2049      bar();
2050}
2051
2052into:
2053	movl	$-1, %eax
2054	lock
2055	xaddl	%eax, (%rdi)
2056	cmpl	$1, %eax
2057	je	LBB0_2
2058
2059Instead we could generate:
2060
2061	lock
	decl (%rdi)
2063	je LBB0_2
2064
2065The trick is to match "fetch_and_add(X, -C) == C".
2066
2067//===---------------------------------------------------------------------===//
2068
2069