; NOTE(review): the six lines previously here ("Home / Line# / Scopes# /
; Navigate / Raw / Download") were HTML source-viewer navigation residue
; from the page this file was extracted from, not part of the test file.
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
; ADDPD/ADDPS/ADDSD/ADDSS folding tests (packed and scalar FP addition).
define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addpd
  ;CHECK:       addpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addps
  ;CHECK:       addps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <4 x float> %a0, %a1
  ret <4 x float> %2
}

define double @stack_fold_addsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_addsd
  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsd_int
  ;CHECK:       addsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_addss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_addss
  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addss_int
  ;CHECK:       addss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
60
; ADDSUBPD/ADDSUBPS (SSE3) folding tests.
define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_addsubpd
  ;CHECK:       addsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_addsubps
  ;CHECK:       addsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
78
; ANDNPD/ANDNPS/ANDPD/ANDPS folding tests. The logic is expressed as integer
; bitwise ops on bitcast values; a trailing fadd pins the FP execution domain
; so the FP (not integer) form of the instruction is selected.
define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andnpd
  ;CHECK:       andnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <2 x double>
  ; fadd forces execution domain
  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
  ret <2 x double> %7
}

define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andnps
  ;CHECK:       andnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <4 x float>
  ; fadd forces execution domain
  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %7
}

define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_andpd
  ;CHECK:       andpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_andps
  ;CHECK:       andps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}
132
; BLENDPD/BLENDPS (immediate) and BLENDVPD/BLENDVPS (implicit xmm0 mask,
; SSE4.1) folding tests. The blendv tests leave xmm0-2 unclobbered since the
; variable-blend form needs the mask live in xmm0.
define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_blendpd
  ;CHECK:       blendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
  ret <2 x double> %2
}

define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_blendps
  ;CHECK:       blendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
  ret <4 x float> %2
}

define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
  ;CHECK-LABEL: stack_fold_blendvpd
  ;CHECK:       blendvpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
  ;CHECK-LABEL: stack_fold_blendvps
  ;CHECK:       blendvps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
166
; CMPPD/CMPPS/CMPSD/CMPSS folding tests (predicate 0 = EQ, hence cmpeq*).
define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmppd
  ;CHECK:       cmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpps
  ;CHECK:       cmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define i32 @stack_fold_cmpsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd
  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq double %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_cmpsd_int
  ;CHECK:       cmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define i32 @stack_fold_cmpss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_cmpss
  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq float %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_cmpss_int
  ;CHECK:       cmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
220
; COMISD/COMISS folding tests (ordered scalar compare, result in EFLAGS).
; TODO stack_fold_comisd

define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_comisd_int
  ;CHECK:       comisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone

; TODO stack_fold_comiss

define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_comiss_int
  ;CHECK:       comiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
242
; Packed conversion folding tests (unary ops: only xmm0 is left unclobbered).
define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2pd
  ;CHECK:       cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone

define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
  ;CHECK-LABEL: stack_fold_cvtdq2ps
  ;CHECK:       cvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <4 x i32> %a0 to <4 x float>
  ret <4 x float> %2
}

define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2dq
  ;CHECK:       cvtpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone

define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtpd2ps
  ;CHECK:       cvtpd2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc <2 x double> %a0 to <2 x float>
  ret <2 x float> %2
}

define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2dq
  ;CHECK:       cvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone

define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtps2pd
  ;CHECK:       cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
294
; Scalar double -> integer / float conversion folding tests.
; TODO stack_fold_cvtsd2si

define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si_int
  ;CHECK:       cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone

; TODO stack_fold_cvtsd2si64

define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvtsd2si64_int
  ;CHECK:       cvtsd2siq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone

; TODO stack_fold_cvtsd2ss

define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsd2ss_int
  ;CHECK:       cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
327
; Integer -> scalar FP conversion folding tests. These spill via the GPR
; clobber list (not xmm) so the integer source is reloaded from the stack.
define double @stack_fold_cvtsi2sd(i32 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi2sd
  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to double
  ret double %2
}

define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
  ;CHECK-LABEL: stack_fold_cvtsi2sd_int
  ;CHECK:       cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone

define double @stack_fold_cvtsi642sd(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642sd
  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  ret double %2
}

define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
  ;CHECK-LABEL: stack_fold_cvtsi642sd_int
  ;CHECK:       cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone

define float @stack_fold_cvtsi2ss(i32 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi2ss
  ;CHECK:       cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  ret float %2
}

define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
  ;CHECK-LABEL: stack_fold_cvtsi2ss_int
  ;CHECK:  cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone

define float @stack_fold_cvtsi642ss(i64 %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtsi642ss
  ;CHECK:       cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  ret float %2
}

define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
  ;CHECK-LABEL: stack_fold_cvtsi642ss_int
  ;CHECK:  cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
395
; Scalar float -> double / integer conversion folding tests.
define double @stack_fold_cvtss2sd(float %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtss2sd
  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fpext float %a0 to double
  ret double %2
}

define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) optsize {
  ;CHECK-LABEL: stack_fold_cvtss2sd_int
  ;CHECK:       cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone

; TODO stack_fold_cvtss2si

define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtss2si_int
  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

; TODO stack_fold_cvtss2si64

define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvtss2si64_int
  ;CHECK:       cvtss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
434
; Truncating (cvtt*) conversion folding tests.
define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttpd2dq
  ;CHECK:       cvttpd2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone

define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttps2dq
  ;CHECK:       cvttps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <4 x float> %a0 to <4 x i32>
  ret <4 x i32> %2
}

define i32 @stack_fold_cvttsd2si(double %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i32
  ret i32 %2
}

define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si_int
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone

define i64 @stack_fold_cvttsd2si64(double %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si64
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i64
  ret i64 %2
}

define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_cvttsd2si64_int
  ;CHECK:       cvttsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone

define i32 @stack_fold_cvttss2si(float %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i32
  ret i32 %2
}

define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si_int
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

define i64 @stack_fold_cvttss2si64(float %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si64
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i64
  ret i64 %2
}

define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_cvttss2si64_int
  ;CHECK:       cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
519
; DIVPD/DIVPS/DIVSD/DIVSS folding tests (packed and scalar FP division).
define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_divpd
  ;CHECK:       divpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <2 x double> %a0, %a1
  ret <2 x double> %2
}

define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_divps
  ;CHECK:       divps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <4 x float> %a0, %a1
  ret <4 x float> %2
}

define double @stack_fold_divsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_divsd
  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_divsd_int
  ;CHECK:       divsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone

define float @stack_fold_divss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_divss
  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_divss_int
  ;CHECK:       divss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
569
; SSE4.1 dot product (dppd, immediate mask 7): the reload must fold while
; the immediate operand is preserved in the folded form.
define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_dppd
  ;CHECK:       dppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

; Same check for the single-precision dot product (dpps).
define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_dpps
  ;CHECK:       dpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
587
; This test is spill-side: the extracted lane lives in a GPR across the asm
; block (all GPRs are clobbered here, not the xmm registers), so codegen
; should store it with extractps directly to the stack (Folded Spill) and
; reload it afterwards with a plain movl.
define i32 @stack_fold_extractps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_extractps
  ;CHECK:       extractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
  %1 = extractelement <4 x float> %a0, i32 1
  %2 = bitcast float %1 to i32
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}
597
; SSE3 horizontal add, double: folded 16-byte reload into haddpd.
define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_haddpd
  ;CHECK:       haddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone

; SSE3 horizontal add, float: folded 16-byte reload into haddps.
define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_haddps
  ;CHECK:       haddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

; SSE3 horizontal subtract, double: folded 16-byte reload into hsubpd.
define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_hsubpd
  ;CHECK:       hsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone

; SSE3 horizontal subtract, float: folded 16-byte reload into hsubps.
define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_hsubps
  ;CHECK:       hsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
633
634; TODO stack_fold_insertps
635
; max.pd intrinsic: folded 16-byte reload into maxpd.
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxpd
  ;CHECK:       maxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone

; max.ps intrinsic: folded 16-byte reload into maxps.
define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxps
  ;CHECK:       maxps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

; Scalar max via fcmp ogt + select — the IR pattern that isel matches to
; maxsd; expect an 8-byte folded reload.
define double @stack_fold_maxsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_maxsd
  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

; Intrinsic form of maxsd: full 16-byte vector spill, folded into maxsd.
define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_maxsd_int
  ;CHECK:       maxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar float max via fcmp ogt + select; 4-byte folded reload into maxss.
define float @stack_fold_maxss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_maxss
  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

; Intrinsic form of maxss: 16-byte vector spill, folded into maxss.
define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_maxss_int
  ;CHECK:       maxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
689
; min.pd intrinsic: folded 16-byte reload into minpd.
define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minpd
  ;CHECK:       minpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

; min.ps intrinsic: folded 16-byte reload into minps.
define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minps
  ;CHECK:       minps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

; Scalar min via fcmp olt + select — the IR pattern matched to minsd;
; expect an 8-byte folded reload.
define double @stack_fold_minsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_minsd
  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}

; Intrinsic form of minsd: full 16-byte vector spill, folded into minsd.
define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_minsd_int
  ;CHECK:       minsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar float min via fcmp olt + select; 4-byte folded reload into minss.
define float @stack_fold_minss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_minss
  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}

; Intrinsic form of minss: 16-byte vector spill, folded into minss.
define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_minss_int
  ;CHECK:       minss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
743
; Splat of lane 0 (<2 x i32> <0,0> shuffle) should lower to movddup loading
; directly from the spill slot; single-arg test, so xmm1 is clobbered too.
define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_movddup
  ;CHECK:   movddup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %2
}
751; TODO stack_fold_movhpd (load / store)
752; TODO stack_fold_movhps (load / store)
753
754; TODO stack_fold_movlpd (load / store)
755; TODO stack_fold_movlps (load / store)
756
; Duplicate odd lanes (<1,1,3,3> shuffle): movshdup with a folded reload.
define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movshdup
  ;CHECK:       movshdup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %2
}

; Duplicate even lanes (<0,0,2,2> shuffle): movsldup with a folded reload.
define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_movsldup
  ;CHECK:       movsldup {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %2
}
772
; fmul <2 x double>: folded 16-byte reload into mulpd.
define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulpd
  ;CHECK:       mulpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <2 x double> %a0, %a1
  ret <2 x double> %2
}

; fmul <4 x float>: folded 16-byte reload into mulps.
define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulps
  ;CHECK:       mulps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x float> %a0, %a1
  ret <4 x float> %2
}

; Scalar fmul: 8-byte folded reload into mulsd.
define double @stack_fold_mulsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_mulsd
  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

; Intrinsic form of mulsd: 16-byte vector spill, folded into mulsd.
define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_mulsd_int
  ;CHECK:       mulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar float fmul: 4-byte folded reload into mulss.
define float @stack_fold_mulss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_mulss
  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

; Intrinsic form of mulss: 16-byte vector spill, folded into mulss.
define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_mulss_int
  ;CHECK:       mulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
822
; Bitwise OR on FP data: the or is done on <2 x i64> bitcasts, and the
; trailing fadd pins the instruction to the FP execution domain so the
; folded form is orpd rather than por.
define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_orpd
  ;CHECK:       orpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

; Same pattern for single precision: expect orps, not por.
define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_orps
  ;CHECK:       orps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}
848
849; TODO stack_fold_rcpps
850
; rcp.ps intrinsic: single-arg test (xmm1 clobbered too); the reload folds
; into rcpps.
define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rcpps_int
  ;CHECK:       rcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
859
860; TODO stack_fold_rcpss
861; TODO stack_fold_rcpss_int
862
; SSE4.1 roundpd (rounding-mode immediate 7): reload folds, immediate kept.
define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_roundpd
  ;CHECK:       roundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

; Same check for roundps.
define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_roundps
  ;CHECK:       roundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
880
881; TODO stack_fold_roundsd
882; TODO stack_fold_roundsd_int
883
884; TODO stack_fold_roundss
885; TODO stack_fold_roundss_int
886
887; TODO stack_fold_rsqrtps
888
; rsqrt.ps intrinsic: single-arg test; the reload folds into rsqrtps.
define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_rsqrtps_int
  ;CHECK:       rsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
897
898; TODO stack_fold_rsqrtss
899; TODO stack_fold_rsqrtss_int
900
; Two-input shuffle <1,2> lowers to shufpd $1 with the second operand
; reloaded from the spill slot.
define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_shufpd
  ;CHECK:       shufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
  ret <2 x double> %2
}

; Shuffle <0,2,4,7> lowers to shufps $200 (0b11001000) with a folded reload.
define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_shufps
  ;CHECK:       shufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
  ret <4 x float> %2
}
916
; sqrt.pd intrinsic: single-arg test; the reload folds into sqrtpd.
define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtpd
  ;CHECK:       sqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

; sqrt.ps intrinsic: same check for sqrtps.
define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
  ;CHECK-LABEL: stack_fold_sqrtps
  ;CHECK:       sqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
934
; Declarations kept in place for the still-to-be-written scalar sqrt
; folding tests below each TODO.
; TODO stack_fold_sqrtsd
declare double @llvm.sqrt.f64(double) nounwind readnone

; TODO stack_fold_sqrtsd_int
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

; TODO stack_fold_sqrtss
declare float @llvm.sqrt.f32(float) nounwind readnone

; TODO stack_fold_sqrtss_int
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
946
; fsub <2 x double>: folded 16-byte reload into subpd.
define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subpd
  ;CHECK:       subpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <2 x double> %a0, %a1
  ret <2 x double> %2
}

; fsub <4 x float>: folded 16-byte reload into subps.
define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subps
  ;CHECK:       subps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub <4 x float> %a0, %a1
  ret <4 x float> %2
}

; Scalar fsub: 8-byte folded reload into subsd.
define double @stack_fold_subsd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_subsd
  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}

; Intrinsic form of subsd: 16-byte vector spill, folded into subsd.
define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_subsd_int
  ;CHECK:       subsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar float fsub: 4-byte folded reload into subss.
define float @stack_fold_subss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_subss
  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}

; Intrinsic form of subss: 16-byte vector spill, folded into subss.
define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_subss_int
  ;CHECK:       subss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
996
; Unordered compare via fcmp ueq + select: expect ucomisd with an 8-byte
; folded reload (the compare result feeds EFLAGS-based selection).
define i32 @stack_fold_ucomisd(double %a0, double %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd
  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq double %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

; Intrinsic form (ucomieq.sd): 16-byte vector spill, folded into ucomisd.
define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_ucomisd_int
  ;CHECK:       ucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone

; Single-precision variant: ucomiss with a 4-byte folded reload.
define i32 @stack_fold_ucomiss(float %a0, float %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss
  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ueq float %a0, %a1
  %3 = select i1 %2, i32 1, i32 -1
  ret i32 %3
}

; Intrinsic form (ucomieq.ss): 16-byte vector spill, folded into ucomiss.
define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_ucomiss_int
  ;CHECK:       ucomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
1032
; High-lane interleave shuffle <1,3>: unpckhpd with a folded reload.
define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhpd
  ;CHECK:       unpckhpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %2
}

; High-lane interleave <2,6,3,7>: unpckhps with a folded reload.
define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpckhps
  ;CHECK:       unpckhps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %2
}

; Low-lane interleave <0,2>: unpcklpd with a folded reload.
define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklpd
  ;CHECK:       unpcklpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %2
}

; Low-lane interleave <0,4,1,5>: unpcklps with a folded reload.
define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_unpcklps
  ;CHECK:       unpcklps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %2
}
1064
; Bitwise XOR on FP data via <2 x i64> bitcasts; the trailing fadd forces
; the FP execution domain so the folded instruction is xorpd, not pxor.
define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
  ;CHECK-LABEL: stack_fold_xorpd
  ;CHECK:       xorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

; Same pattern for single precision: expect xorps.
define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
  ;CHECK-LABEL: stack_fold_xorps
  ;CHECK:       xorps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}
1090