; NOTE: removed code-viewer navigation chrome ("Home / Line# / Scopes# / Navigate / Raw / Download")
; left over from HTML extraction of this test file.
; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

; ADDPD/ADDPS/ADDSD/ADDSS: the inline asm clobbers xmm2-xmm15, forcing a spill of
; one operand; FileCheck verifies the reload is folded into the add's memory operand.
define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addpd
  ;CHECK:       addpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addps
  ;CHECK:       addps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <4 x float> %a0, %a1
  ret <4 x float> %2
}

define double @stack_fold_addsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_addsd
  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsd_int
  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fadd double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_addss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_addss
  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addss_int
  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fadd float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

; ADDSUBPD/ADDSUBPS folding tests (SSE3 addsub intrinsics).
define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsubpd
  ;CHECK:       addsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addsubps
  ;CHECK:       addsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone

; ANDNPD/ANDNPS/ANDPD/ANDPS: the logic is written on <2 x i64> bitcasts; the trailing
; fadd with zero keeps the operation in the floating-point execution domain so the
; FP (pd/ps) form of the instruction is selected rather than the integer form.
define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andnpd
  ;CHECK:       andnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <2 x double>
  ; fadd forces execution domain
  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
  ret <2 x double> %7
}

define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andnps
  ;CHECK:       andnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <4 x float>
  ; fadd forces execution domain
  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %7
}

define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andpd
  ;CHECK:       andpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andps
  ;CHECK:       andps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

; BLENDPD/BLENDPS (constant-mask select) and BLENDVPD/BLENDVPS (variable mask via
; the implicit xmm0 operand). The blendv tests clobber only xmm3-xmm15 because
; xmm0-xmm2 must stay live to carry the three operands.
define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_blendpd
  ;CHECK:       blendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_blendps
  ;CHECK:       blendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
  ;CHECK-LABEL: stack_fold_blendvpd
  ;CHECK:       blendvpd %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
  ;CHECK-LABEL: stack_fold_blendvps
  ;CHECK:       blendvps %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

; CMP*PD/PS/SD/SS folding tests; immediate 0 / fcmp oeq selects the "eq" predicate,
; so FileCheck looks for the cmpeq* alias.
define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmppd
  ;CHECK:       cmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpps
  ;CHECK:       cmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define i32 @stack_fold_cmpsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd
  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq double %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd_int
  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define i32 @stack_fold_cmpss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_cmpss
  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq float %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpss_int
  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

; TODO stack_fold_comisd

define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_comisd_int
  ;CHECK:       comisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone

; TODO stack_fold_comiss

define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_comiss_int
  ;CHECK:       comiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

; CVT* conversion tests. These clobber xmm1-xmm15 (the single input arrives in xmm0).
define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd
  ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %3 = sitofp <2 x i32> %2 to <2 x double>
  ret <2 x double> %3
}

define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd_int
  ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %2 to <2 x double>
  ret <2 x double> %cvt
}

define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2ps
  ;CHECK:       cvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <4 x i32> %a0 to <4 x float>
  ret <4 x float> %2
}

define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2dq
  ;CHECK:       cvtpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone

define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2ps
  ;CHECK:       cvtpd2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc <2 x double> %a0 to <2 x float>
  ret <2 x float> %2
}

define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2dq
  ;CHECK:       cvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone

define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd
  ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %3 = fpext <2 x float> %2 to <2 x double>
  ret <2 x double> %3
}

define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd_int
  ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
  %cvtps2pd = fpext <2 x float> %2 to <2 x double>
  ret <2 x double> %cvtps2pd
}

; TODO stack_fold_cvtsd2si

define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si_int
  ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone

; TODO stack_fold_cvtsd2si64

define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si64_int
  ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone

; minsize/optsize encourage the folded (memory-operand) form of cvtsd2ss.
define float @stack_fold_cvtsd2ss(double %a0) minsize {
  ;CHECK-LABEL: stack_fold_cvtsd2ss
  ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc double %a0 to float
  ret float %2
}

define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsd2ss_int
  ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone

; CVTSI2SD/CVTSI2SS (32- and 64-bit GPR sources): here the inline asm clobbers all
; general-purpose registers instead of xmm registers, forcing the integer argument
; itself to be spilled and reloaded via the folded memory operand.
define double @stack_fold_cvtsi2sd(i32 %a0) minsize {
  ;CHECK-LABEL: stack_fold_cvtsi2sd
  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to double
  ret double %2
}

define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0, <2 x double> %b0) {
  ;CHECK-LABEL: stack_fold_cvtsi2sd_int
  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to double
  %3 = insertelement <2 x double> %b0, double %2, i64 0
  ret <2 x double> %3
}

define double @stack_fold_cvtsi642sd(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642sd
  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  ret double %2
}

define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0, <2 x double> %b0) {
  ;CHECK-LABEL: stack_fold_cvtsi642sd_int
  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  %3 = insertelement <2 x double> %b0, double %2, i64 0
  ret <2 x double> %3
}

define float @stack_fold_cvtsi2ss(i32 %a0) minsize {
  ;CHECK-LABEL: stack_fold_cvtsi2ss
  ;CHECK:       cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  ret float %2
}

define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0, <4 x float> %b0) {
  ;CHECK-LABEL: stack_fold_cvtsi2ss_int
  ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  %3 = insertelement <4 x float> %b0, float %2, i64 0
  ret <4 x float> %3
}

define float @stack_fold_cvtsi642ss(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642ss
  ;CHECK:       cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  ret float %2
}

define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0, <4 x float> %b0) {
  ;CHECK-LABEL: stack_fold_cvtsi642ss_int
  ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  %3 = insertelement <4 x float> %b0, float %2, i64 0
  ret <4 x float> %3
}

define double @stack_fold_cvtss2sd(float %a0) minsize {
  ;CHECK-LABEL: stack_fold_cvtss2sd
  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fpext float %a0 to double
  ret double %2
}

define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtss2sd_int
  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i64 0
  %3 = fpext float %2 to double
  %4 = insertelement <2 x double> zeroinitializer, double %3, i64 0
  ret <2 x double> %4
}

; TODO stack_fold_cvtss2si

define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtss2si_int
  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

; TODO stack_fold_cvtss2si64

define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtss2si64_int
  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone

; Truncating conversions (CVTT*): round-toward-zero FP-to-integer folding tests.
define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttpd2dq
  ;CHECK:       cvttpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone

define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttps2dq
  ;CHECK:       cvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <4 x float> %a0 to <4 x i32>
  ret <4 x i32> %2
}

define i32 @stack_fold_cvttsd2si(double %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i32
  ret i32 %2
}

define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si_int
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone

define i64 @stack_fold_cvttsd2si64(double %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si64
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i64
  ret i64 %2
}

define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si64_int
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone

define i32 @stack_fold_cvttss2si(float %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i32
  ret i32 %2
}

define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si_int
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

define i64 @stack_fold_cvttss2si64(float %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si64
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i64
  ret i64 %2
}

define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si64_int
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone

; FDIV folding tests. Binary-op tests clobber xmm2-xmm15, leaving only
; xmm0/xmm1 for the two operands, so one of them must be spilled; the CHECK
; lines require its reload to fold into divpd/divps/divsd/divss. The *_int
; tests express the scalar op as extract/insert around a scalar fdiv; the
; llvm.x86.sse*.div.* declarations below are no longer called by them.
define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_divpd
  ;CHECK:       divpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_divps
  ;CHECK:       divps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <4 x float> %a0, %a1
  ret <4 x float> %2
}

define double @stack_fold_divsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_divsd
  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_divsd_int
  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fdiv double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}
declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_divss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_divss
  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_divss_int
  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fdiv float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
608
; SSE4.1 dot-product folding tests: dppd/dpps with immediate mask 7 must
; fold the reload of the spilled second operand.
define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_dppd
  ;CHECK:       dppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_dpps
  ;CHECK:       dpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
626
; extractps spill test: unlike the other tests this clobbers every GPR, so
; the extracted i32 lane must be spilled to the stack with extractps (the
; folded 4-byte store) and later reloaded into %eax with a plain movl.
define i32 @stack_fold_extractps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_extractps
  ;CHECK:       extractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
  ; fadd forces execution domain
  %1 = fadd <4 x float> %a0, <float 1.0, float 2.0, float 3.0, float 4.0>
  %2 = extractelement <4 x float> %1, i32 1
  %3 = bitcast float %2 to i32
  %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %3
}
638
; SSE3 horizontal add/sub folding tests (haddpd/haddps/hsubpd/hsubps),
; expressed via the corresponding intrinsics; the reload of the spilled
; second operand must fold into the instruction.
define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_haddpd
  ;CHECK:       haddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_haddps
  ;CHECK:       haddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_hsubpd
  ;CHECK:       hsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_hsubps
  ;CHECK:       hsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
674
; insertps folding. The IR immediate is 209 (0xD1); with a folded memory
; operand the count_s source-lane field is dropped, so the expected
; instruction immediate is $17 (0x11) — presumably the folder rewrites the
; immediate when selecting the memory form; TODO confirm. The CHECK-NEXT
; line matches the lane comment the printer emits for the result.
define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_insertps
  ;CHECK:       insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  ;CHECK-NEXT:                                                        {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
684
; MAX folding tests. Packed and *_int forms call the SSE max intrinsics;
; the plain scalar forms use the fcmp ogt + select pattern that lowers to
; maxsd/maxss. Attribute groups #0/#1 are defined elsewhere in this file;
; the _commutable variants carry #1, presumably enabling fast-math so the
; operands may be commuted when folding — TODO confirm against the
; attribute definitions at the end of the file.
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxpd
  ;CHECK:       maxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone

define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxpd_commutable
  ;CHECK:       maxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}

define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxps
  ;CHECK:       maxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxps_commutable
  ;CHECK:       maxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}

define double @stack_fold_maxsd(double %a0, double %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxsd
  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxsd_commutable
  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxsd_int
  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_maxss(float %a0, float %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxss
  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
  ;CHECK-LABEL: stack_fold_maxss_commutable
  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_maxss_int
  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
772
; MIN folding tests, mirroring the MAX group: intrinsics for the packed and
; *_int forms, fcmp olt + select for the plain scalar forms (lowered to
; minsd/minss). Attribute groups #0/#1 are defined elsewhere in this file;
; #1 on the _commutable variants presumably enables fast-math commuting of
; the folded operand — TODO confirm against the attribute definitions.
define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minpd
  ;CHECK:       minpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
  ;CHECK-LABEL: stack_fold_minpd_commutable
  ;CHECK:       minpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}

define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minps
  ;CHECK:       minps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
  ;CHECK-LABEL: stack_fold_minps_commutable
  ;CHECK:       minps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}

define double @stack_fold_minsd(double %a0, double %a1) #0 {
  ;CHECK-LABEL: stack_fold_minsd
  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
  ;CHECK-LABEL: stack_fold_minsd_commutable
  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minsd_int
  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_minss(float %a0, float %a1) #0 {
  ;CHECK-LABEL: stack_fold_minss
  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
  ;CHECK-LABEL: stack_fold_minss_commutable
  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
  ;CHECK-LABEL: stack_fold_minss_int
  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
860
; Duplicate-shuffle folding tests: movddup / movshdup / movsldup, each
; expressed as a shufflevector with the corresponding lane-repeat mask.
; Unary tests clobber xmm1-xmm15 so only xmm0 survives the asm.
define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup
  ;CHECK:   movddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %2
}
; TODO stack_fold_movhpd (load / store)
; TODO stack_fold_movhps (load / store)

; TODO stack_fold_movlpd (load / store)
; TODO stack_fold_movlps (load / store)

define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup
  ;CHECK:       movshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %2
}

define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup
  ;CHECK:       movsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %2
}
889
; FMUL folding tests: packed (mulpd/mulps), plain scalar (mulsd/mulss), and
; *_int forms that express the scalar op as extract/insert around a scalar
; fmul on lane 0.
define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd
  ;CHECK:       mulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps
  ;CHECK:       mulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x float> %a0, %a1
  ret <4 x float> %2
}

define double @stack_fold_mulsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_mulsd
  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulsd_int
  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fmul double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_mulss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_mulss
  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulss_int
  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fmul float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
943
; Bitwise OR folding tests. The OR is written on <2 x i64> bitcasts; the
; trailing fadd-with-zero keeps the value in the floating-point execution
; domain so the FP forms orpd/orps are selected (rather than the integer
; por).
define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd
  ;CHECK:       orpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps
  ;CHECK:       orps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}
969
970; TODO stack_fold_rcpps
971
; Reciprocal-estimate folding tests via the SSE rcp intrinsics. The plain
; (non-intrinsic) forms are still TODO. rcpss_int is marked optsize —
; presumably the scalar memory form is only selected when optimizing for
; size; TODO confirm.
define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpps_int
  ;CHECK:       rcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

; TODO stack_fold_rcpss

define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0, <4 x float> %a1) optsize {
  ;CHECK-LABEL: stack_fold_rcpss_int
  ;CHECK:       rcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a1)
  %3 = extractelement <4 x float> %2, i32 0
  %4 = insertelement <4 x float> %a0, float %3, i32 0
  ret <4 x float> %4
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>)
993
; SSE4.1 rounding folds. roundpd/roundps pass intrinsic immediate 7; the
; scalar llvm.floor tests expect immediate $9 (round toward -inf with the
; precision exception suppressed), which is how llc lowers floor. The
; scalar and *_int tests carry optsize/minsize attributes — presumably the
; folded scalar forms are only selected when optimizing for size; TODO
; confirm.
define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd
  ;CHECK:       roundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps
  ;CHECK:       roundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define double @stack_fold_roundsd(double %a0) optsize {
  ;CHECK-LABEL: stack_fold_roundsd
  ;CHECK:       roundsd $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}
declare double @llvm.floor.f64(double) nounwind readnone

define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
  ;CHECK-LABEL: stack_fold_roundsd_int
  ;CHECK:       roundsd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define float @stack_fold_roundss(float %a0) minsize {
  ;CHECK-LABEL: stack_fold_roundss
  ;CHECK:       roundss $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.floor.f32(float %a0)
  ret float %2
}
declare float @llvm.floor.f32(float) nounwind readnone

define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize {
  ;CHECK-LABEL: stack_fold_roundss_int
  ;CHECK:       roundss $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
1047
1048; TODO stack_fold_rsqrtps
1049
; Reciprocal-square-root-estimate folding tests via the SSE rsqrt
; intrinsics; like the rcp group, the plain forms are TODO and the scalar
; intrinsic form is optsize-gated.
define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtps_int
  ;CHECK:       rsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

; TODO stack_fold_rsqrtss

define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0, <4 x float> %a1) optsize {
  ;CHECK-LABEL: stack_fold_rsqrtss_int
  ;CHECK:       rsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a1)
  %3 = extractelement <4 x float> %2, i32 0
  %4 = insertelement <4 x float> %a0, float %3, i32 0
  ret <4 x float> %4
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>)
1071
; Fold test: the nop clobbers xmm2-xmm15, spilling one operand of the shuffle;
; CHECK verifies the reload folds into shufpd with immediate 1 (mask <1,2>).
1072define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
1073  ;CHECK-LABEL: stack_fold_shufpd
1074  ;CHECK:       shufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1075  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1076  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
1077  ; fadd forces execution domain
1078  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
1079  ret <2 x double> %3
1080}
1081
; Fold test: spill across the xmm2-xmm15-clobbering nop; CHECK verifies the
; reload folds into shufps with immediate 200 (mask <0,2,4,7>).
1082define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
1083  ;CHECK-LABEL: stack_fold_shufps
1084  ;CHECK:       shufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1085  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1086  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
1087  ret <4 x float> %2
1088}
1089
; Fold test: the nop clobbers xmm1-xmm15, spilling %a0; CHECK verifies the
; 16-byte reload is folded into sqrtpd.
1090define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
1091  ;CHECK-LABEL: stack_fold_sqrtpd
1092  ;CHECK:       sqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1093  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1094  %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
1095  ret <2 x double> %2
1096}
1097declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
1098
; Fold test: the nop clobbers xmm1-xmm15, spilling %a0; CHECK verifies the
; 16-byte reload is folded into sqrtps.
1099define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
1100  ;CHECK-LABEL: stack_fold_sqrtps
1101  ;CHECK:       sqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1102  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1103  %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
1104  ret <4 x float> %2
1105}
1106declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
1107
; Fold test for scalar sqrtsd via llvm.sqrt.f64: only 8 bytes are reloaded.
; The optsize attribute is part of the test conditions.
1108define double @stack_fold_sqrtsd(double %a0) optsize {
1109  ;CHECK-LABEL: stack_fold_sqrtsd
1110  ;CHECK:       sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
1111  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1112  %2 = call double @llvm.sqrt.f64(double %a0)
1113  ret double %2
1114}
1115declare double @llvm.sqrt.f64(double) nounwind readnone
1116
; Fold test for the sqrtsd intrinsic form: %a1 is spilled across the
; xmm1-xmm15-clobbering nop and reloaded as a folded 16-byte operand; the
; extract/insert pair merges the scalar result into %a0's lane 0.
1117define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
1118  ;CHECK-LABEL: stack_fold_sqrtsd_int
1119  ;CHECK:       sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1120  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1121  %2 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1)
1122  %3 = extractelement <2 x double> %2, i32 0
1123  %4 = insertelement <2 x double> %a0, double %3, i32 0
1124  ret <2 x double> %4
1125}
1126declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
1127
; Fold test for scalar sqrtss via llvm.sqrt.f32: only 4 bytes are reloaded.
; The minsize attribute is part of the test conditions.
1128define float @stack_fold_sqrtss(float %a0) minsize {
1129  ;CHECK-LABEL: stack_fold_sqrtss
1130  ;CHECK:       sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
1131  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1132  %2 = call float @llvm.sqrt.f32(float %a0)
1133  ret float %2
1134}
1135declare float @llvm.sqrt.f32(float) nounwind readnone
1136
; Fold test: scalar sqrt of %a1's lane 0 with a 16-byte folded reload. The
; extract/insert chain (%2..%6) is deliberate test IR that models the packed
; sqrtss intrinsic semantics — do not simplify, the CHECK pattern depends on it.
1137define <4 x float> @stack_fold_sqrtss_int(<4 x float> %a0, <4 x float> %a1) optsize {
1138  ;CHECK-LABEL: stack_fold_sqrtss_int
1139  ;CHECK:       sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1140  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1141  %2 = extractelement <4 x float> %a1, i64 0
1142  %3 = call float @llvm.sqrt.f32(float %2)
1143  %4 = insertelement <4 x float> %a1, float %3, i64 0
1144  %5 = extractelement <4 x float> %4, i32 0
1145  %6 = insertelement <4 x float> %a0, float %5, i32 0
1146  ret <4 x float> %6
1147}
1148
; Fold test: the nop clobbers xmm2-xmm15, spilling one fsub operand; CHECK
; verifies the 16-byte reload is folded into subpd.
1149define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
1150  ;CHECK-LABEL: stack_fold_subpd
1151  ;CHECK:       subpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1152  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1153  %2 = fsub <2 x double> %a0, %a1
1154  ret <2 x double> %2
1155}
1156
; Fold test: the nop clobbers xmm2-xmm15, spilling one fsub operand; CHECK
; verifies the 16-byte reload is folded into subps.
1157define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
1158  ;CHECK-LABEL: stack_fold_subps
1159  ;CHECK:       subps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1160  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1161  %2 = fsub <4 x float> %a0, %a1
1162  ret <4 x float> %2
1163}
1164
; Fold test for scalar fsub: subsd with an 8-byte folded reload.
1165define double @stack_fold_subsd(double %a0, double %a1) {
1166  ;CHECK-LABEL: stack_fold_subsd
1167  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
1168  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1169  %2 = fsub double %a0, %a1
1170  ret double %2
1171}
1172
; Fold test: scalar fsub on lane 0 expressed via extract/insert (the vector
; form of subsd); CHECK verifies a 16-byte folded reload.
1173define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
1174  ;CHECK-LABEL: stack_fold_subsd_int
1175  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1176  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1177  %2 = extractelement <2 x double> %a0, i32 0
1178  %3 = extractelement <2 x double> %a1, i32 0
1179  %4 = fsub double %2, %3
1180  %5 = insertelement <2 x double> %a0, double %4, i32 0
1181  ret <2 x double> %5
1182}
1183
; Fold test for scalar fsub: subss with a 4-byte folded reload.
1184define float @stack_fold_subss(float %a0, float %a1) {
1185  ;CHECK-LABEL: stack_fold_subss
1186  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
1187  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1188  %2 = fsub float %a0, %a1
1189  ret float %2
1190}
1191
; Fold test: scalar fsub on lane 0 expressed via extract/insert (the vector
; form of subss); CHECK verifies a 16-byte folded reload.
1192define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
1193  ;CHECK-LABEL: stack_fold_subss_int
1194  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1195  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1196  %2 = extractelement <4 x float> %a0, i32 0
1197  %3 = extractelement <4 x float> %a1, i32 0
1198  %4 = fsub float %2, %3
1199  %5 = insertelement <4 x float> %a0, float %4, i32 0
1200  ret <4 x float> %5
1201}
1202
; Fold test: fcmp ueq + select lowers to ucomisd; CHECK verifies one compare
; operand is an 8-byte folded stack reload.
1203define i32 @stack_fold_ucomisd(double %a0, double %a1) {
1204  ;CHECK-LABEL: stack_fold_ucomisd
1205  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
1206  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1207  %2 = fcmp ueq double %a0, %a1
1208  %3 = select i1 %2, i32 1, i32 -1
1209  ret i32 %3
1210}
1211
; Fold test for the ucomisd intrinsic form: CHECK verifies a 16-byte folded
; reload of one compare operand.
1212define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
1213  ;CHECK-LABEL: stack_fold_ucomisd_int
1214  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1215  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1216  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
1217  ret i32 %2
1218}
1219declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
1220
; Fold test: fcmp ueq + select lowers to ucomiss; CHECK verifies one compare
; operand is a 4-byte folded stack reload.
1221define i32 @stack_fold_ucomiss(float %a0, float %a1) {
1222  ;CHECK-LABEL: stack_fold_ucomiss
1223  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
1224  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1225  %2 = fcmp ueq float %a0, %a1
1226  %3 = select i1 %2, i32 1, i32 -1
1227  ret i32 %3
1228}
1229
; Fold test for the ucomiss intrinsic form: CHECK verifies a 16-byte folded
; reload of one compare operand.
1230define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
1231  ;CHECK-LABEL: stack_fold_ucomiss_int
1232  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1233  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1234  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
1235  ret i32 %2
1236}
1237declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
1238
; Fold test: shuffle mask <1,3> selects the high elements, lowering to
; unpckhpd; CHECK verifies a 16-byte folded reload of one operand.
1239define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
1240  ;CHECK-LABEL: stack_fold_unpckhpd
1241  ;CHECK:       unpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1242  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1243  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
1244  ; fadd forces execution domain
1245  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
1246  ret <2 x double> %3
1247}
1248
; Fold test: shuffle mask <2,6,3,7> interleaves the high halves, lowering to
; unpckhps; CHECK verifies a 16-byte folded reload of one operand.
1249define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
1250  ;CHECK-LABEL: stack_fold_unpckhps
1251  ;CHECK:       unpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1252  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1253  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
1254  ; fadd forces execution domain
1255  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
1256  ret <4 x float> %3
1257}
1258
; Fold test: shuffle mask <0,2> selects the low elements, lowering to
; unpcklpd; CHECK verifies a 16-byte folded reload of one operand.
1259define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
1260  ;CHECK-LABEL: stack_fold_unpcklpd
1261  ;CHECK:       unpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1262  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1263  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
1264  ; fadd forces execution domain
1265  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
1266  ret <2 x double> %3
1267}
1268
; Fold test: shuffle mask <0,4,1,5> interleaves the low halves, lowering to
; unpcklps; CHECK verifies a 16-byte folded reload of one operand.
1269define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
1270  ;CHECK-LABEL: stack_fold_unpcklps
1271  ;CHECK:       unpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1272  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1273  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
1274  ; fadd forces execution domain
1275  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
1276  ret <4 x float> %3
1277}
1278
; Fold test: integer xor bitcast to/from <2 x double>; the trailing fadd keeps
; the operation in the double domain so xorpd (not pxor) is selected, and
; CHECK verifies a 16-byte folded reload of one operand.
1279define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
1280  ;CHECK-LABEL: stack_fold_xorpd
1281  ;CHECK:       xorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1282  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1283  %2 = bitcast <2 x double> %a0 to <2 x i64>
1284  %3 = bitcast <2 x double> %a1 to <2 x i64>
1285  %4 = xor <2 x i64> %2, %3
1286  %5 = bitcast <2 x i64> %4 to <2 x double>
1287  ; fadd forces execution domain
1288  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
1289  ret <2 x double> %6
1290}
1291
; Fold test: integer xor bitcast to/from <4 x float>; the trailing fadd keeps
; the operation in the float domain so xorps (not pxor) is selected, and
; CHECK verifies a 16-byte folded reload of one operand.
1292define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
1293  ;CHECK-LABEL: stack_fold_xorps
1294  ;CHECK:       xorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
1295  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1296  %2 = bitcast <4 x float> %a0 to <2 x i64>
1297  %3 = bitcast <4 x float> %a1 to <2 x i64>
1298  %4 = xor <2 x i64> %2, %3
1299  %5 = bitcast <2 x i64> %4 to <4 x float>
1300  ; fadd forces execution domain
1301  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
1302  ret <4 x float> %6
1303}
1304
1305attributes #0 = { "unsafe-fp-math"="false" }
1306attributes #1 = { "unsafe-fp-math"="true" }
1307