//===----------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//

SSE variable shifts can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===----------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them.  For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa  %xmm0, %xmm2
        addss   %xmm1, %xmm2
        pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
        pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
        addss   %xmm1, %xmm3
        movaps  %xmm2, %xmm0
        unpcklps        %xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.


//===----------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups are possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
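
A rough sketch of the kind of inline expansion meant here (this is not the code
from the linked patch, and it assumes the value fits in an i32): floor() becomes
a cvttsd2si truncation plus a compare-and-adjust, all staying in SSE registers:

#include <emmintrin.h>

static inline double floor_sse2(double x) {
  double t = (double)_mm_cvttsd_si32(_mm_set_sd(x)); /* cvttsd2si: truncate toward zero */
  return t > x ? t - 1.0 : t;                        /* adjust when truncation rounded up */
}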

//===----------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
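
For reference, a sketch of what such a prologue would do, using the standard
MXCSR intrinsics (the real change would be in codegen / startup code, not in
user source):

#include <xmmintrin.h>
#include <pmmintrin.h>

static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);          /* FTZ: flush denormal results to zero */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);  /* DAZ: treat denormal inputs as zero */
}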

//===----------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===----------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===----------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
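
As an illustration of the intent (a hypothetical helper, not the proposed
lowering itself), a small fixed-size copy done 16 bytes at a time with SSE
moves instead of a call to memcpy:

#include <emmintrin.h>

static void copy64(void *dst, const void *src) {
  __m128i *d = (__m128i *)dst;
  const __m128i *s = (const __m128i *)src;
  for (int i = 0; i < 4; ++i)                       /* 4 x 16 bytes = 64 bytes */
    _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}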

//===----------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if ((x ^ y) & mask)
when using SSE.
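
A scalar illustration of why this works (hypothetical helper, not the intended
codegen): the signs are equal exactly when the xor of the two bit patterns has
a clear sign bit, so the copysign compare reduces to a mask test.

#include <stdint.h>
#include <string.h>

static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);   /* bitcast; the SSE form keeps this in xmm regs */
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
}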

//===----------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value; likewise, use movlps
for the lower half of a v4sf value.
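
At the source level these correspond to _mm_loadh_pi (movhps) and _mm_loadl_pi
(movlps); a minimal sketch:

#include <xmmintrin.h>

__m128 set_high_half(__m128 v, const __m64 *p) { return _mm_loadh_pi(v, p); }
__m128 set_low_half (__m128 v, const __m64 *p) { return _mm_loadl_pi(v, p); }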

//===----------------------------------------------------------------------===//

Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===----------------------------------------------------------------------===//

The external test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps    (%edx), %xmm2                                 #59.21
        movaps    (%edx), %xmm5                                 #60.21
        movaps    (%edx), %xmm4                                 #61.21
        movaps    (%edx), %xmm3                                 #62.21
        movl      40(%ecx), %ebp                                #69.49
        shufps    $0, %xmm2, %xmm5                              #60.21
        movl      100(%esp), %ebx                               #69.20
        movl      (%ebx), %edi                                  #69.20
        imull     %ebp, %edi                                    #69.49
        addl      (%eax), %edi                                  #70.33
        shufps    $85, %xmm2, %xmm4                             #61.21
        shufps    $170, %xmm2, %xmm3                            #62.21
        shufps    $255, %xmm2, %xmm2                            #63.21
        lea       (%ebp,%ebp,2), %ebx                           #69.49
        negl      %ebx                                          #69.49
        lea       -3(%edi,%ebx), %ebx                           #70.33
        shll      $4, %ebx                                      #68.37
        addl      32(%ecx), %ebx                                #68.37
        testb     $15, %bl                                      #91.13
        jne       L_B1.24       # Prob 5%                       #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
        %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %eax = MOV32ri -3
        %edx = MOV32rm %stack.3, 1, %noreg, 0
        ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
        %edx = MOV32rm %stack.7, 1, %noreg, 0
        %edx = MOV32rm %edx, 1, %noreg, 40
        IMUL32rr %eax<def&use>, %edx
        %esi = MOV32rm %stack.5, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 0
        MOV32mr %stack.4, 1, %noreg, 0, %esi
        %eax = LEA32r %esi, 1, %eax, -3
        %esi = MOV32rm %stack.7, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 32
        %edi = MOV32rr %eax
        SHL32ri %edi<def&use>, 4
        ADD32rr %edi<def&use>, %esi
        %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
        %xmm1 = MOVAPSrr %xmm0
        SHUFPSrr %xmm1<def&use>, %xmm1, 170
        %xmm2 = MOVAPSrr %xmm0
        SHUFPSrr %xmm2<def&use>, %xmm2, 0
        %xmm3 = MOVAPSrr %xmm0
        SHUFPSrr %xmm3<def&use>, %xmm3, 255
        SHUFPSrr %xmm0<def&use>, %xmm0, 85
        %ebx = MOV32rr %edi
        AND32ri8 %ebx<def&use>, 15
        CMP32ri8 %ebx, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode: since
the same value appears as the second operand of more than one shufps, a number
of copies result. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd instead, or the register allocator
should perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===----------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

movss 4(%esp), %xmm1
mulss %xmm1, %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zeroed.  We could compile this to:

movss 4(%esp), %xmm0
mulss %xmm0, %xmm0
ret

//===----------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

movaps c(%esp), %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

xorps %xmm0, %xmm0
movaps c2(%esp), %xmm1
movss %xmm1, %xmm0
ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

movss c2(%esp), %xmm0
ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of all 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===----------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl    8(%esp), %eax
        movaps  (%eax), %xmm0
        pxor    %xmm1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $50, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
        xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret

?

//===----------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

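The and/andnot/or select idiom mentioned above, as a minimal sketch: for each
lane, take a where the (all-ones / all-zeros) mask is set and b where it is
clear.

#include <xmmintrin.h>

static __m128 sse_select(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}
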
//===----------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===----------------------------------------------------------------------===//

Apply the same transformation that merged four floats into a single 128-bit load
to loads from the constant pool.

//===----------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-math is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===----------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones

and:
     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
     psrlq   xmm1, 31     ; xmm1 = all 100000000000...

instead of using a load from the constant pool.  The latter is important for
ABS/NEG/copysign etc.

//===----------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):
_x:
        movzwl  4(%esp), %eax
        movd    %eax, %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret
_y:
        movd    4(%esp), %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff through a GPR.  It seems like
movd alone would be sufficient in both cases, as the value is already zero
extended in the 32-bit stack slot IIRC.  For signed short it should also be
safe, as a negative value would be undefined for pslld anyway.


//===----------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.
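
At the source level the movmskpd form corresponds to something like this
(a sketch using the usual SSE2 intrinsics):

#include <emmintrin.h>

int t1_sse2(double d) {
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;   /* bit 0 = sign of the low lane */
}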

//===----------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load.  This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner.  This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack.  It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load.  If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===----------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load.  For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
entry:
 %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                #  <4 x float>
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor.  This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===----------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===----------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===----------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 elements simultaneously.  Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===----------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything.  This is probably related
to <2 x i64> ops being so bad.

//===----------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually needed.
The problem is that we need to know about stack alignment too early, before RA
runs.

At that point we don't know whether there will be a vector spill or not.
The stack realignment logic is overly conservative here, but otherwise we could
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===----------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
 return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss   4(%esp), %xmm0
        pshufd  $81, %xmm0, %xmm0
        ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps   %xmm1, %xmm1
        movss   %xmm0, %xmm1
        pshufd  $81, %xmm1, %xmm0
        ret

In SSE4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
 return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
       insertps        $0x10, x2(%rip), %xmm0
       insertps        $0x10, x3(%rip), %xmm1
       movaps  %xmm1, %xmm2
       movlhps %xmm0, %xmm2
       movaps  %xmm2, %xmm0
       ret

//===----------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind  {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                ##  <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq       %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq       %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.
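
For the constant 10 above, the expansion is x*10 == (x << 3) + (x << 1); in
intrinsic form, a sketch of the two pslld and one paddd the backend would emit:

#include <emmintrin.h>

__m128i mul_by_10(__m128i x) {
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}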

//===----------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret


gcc-4.2:
        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align 4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be:
      movdqa  .LC0(%rip), %xmm0
      pinsrb  $6, %edi, %xmm0

//===----------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Likewise, insertelement of a constant into a vector of constants
should result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align  4
LCPI1_1:                                ##  float
        .long   1065353216      ## float 1
        .const

        .align  4
LCPI1_0:                                ##  <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0

//===----------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to this IR (x86-32):

define float @foo(i8 zeroext  %x) nounwind  {
        %tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
        ret float %tmp12
}

which in turn compiles to:

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss        %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
  cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===----------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===----------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE.  SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd        16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in scimark where using fpstack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.

//===----------------------------------------------------------------------===//

These should compile into the same code (PR6214); perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===----------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0], then a float store.

//===----------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div is
slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
and emit 3 mulsd in place of the divs. This can be done as a target-independent
transform.

If we're dealing with floats instead of doubles we could even replace the sqrtss
and the inversion with an rsqrtss instruction, which computes 1/sqrt faster at
the cost of reduced accuracy.
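
For reference, the hand-applied version of the transform on the norm() example
above (a source-level sketch, not compiler output):

void norm_fast(double x, double y, double z) {
  double inv_scale = 1.0 / __builtin_sqrt(x*x + y*y + z*z);
  foo(x*inv_scale, y*inv_scale, z*inv_scale);   /* 1 div + 3 mul instead of 3 divs */
}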

//===----------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.
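
The two-vector form from the last sentence, written out (with SSE3 enabled this
is exactly what the _mm_hsub_pd intrinsic produces):

__m128d g(__m128d v, __m128d w) {
  return (__m128d){ v[0] - v[1], w[0] - w[1] };
}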

//===----------------------------------------------------------------------===//
