; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c

; _mm_add_ps: full-width vector fadd must select a single addps.
define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ps:
; X32:       # BB#0:
; X32-NEXT:    addps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_add_ps:
; X64:       # BB#0:
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fadd <4 x float> %a0, %a1
  ret <4 x float> %res
}

; _mm_add_ss: extract lane 0, scalar fadd, reinsert — must fold to addss.
define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ss:
; X32:       # BB#0:
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_add_ss:
; X64:       # BB#0:
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fadd = fadd float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fadd, i32 0
  ret <4 x float> %res
}

; _mm_and_ps: without SSE2, fast-isel scalarizes the <4 x i32> 'and' through
; GPRs (spill both vectors, four andl's, rebuild with movss/unpcklps).
define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_and_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_and_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    andl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    andl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = and <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

; _mm_andnot_ps: (~a0) & a1, expressed as xor with all-ones then and; also
; scalarized through GPRs (notl + andl per element) without SSE2.
define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_andnot_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    notl %edx
; X32-NEXT:    notl %ecx
; X32-NEXT:    notl %esi
; X32-NEXT:    notl %eax
; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_andnot_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdx
; X64-NEXT:    shrq $32, %rdx
; X64-NEXT:    movq %rax, %rsi
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    notl %eax
; X64-NEXT:    andl %edi, %eax
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    notl %ecx
; X64-NEXT:    andl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    notl %esi
; X64-NEXT:    notl %edx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %r8d, %edx
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %edi, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <4 x i32> %not, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

; _mm_cmpeq_ps: fcmp oeq + sext-to-mask must select cmpeqps.
define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpeqps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpeq_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpeqps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpeq_ss: cmp.ss intrinsic with predicate imm 0 (EQ) -> cmpeqss.
define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpeqss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpeq_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpeqss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

; _mm_cmpge_ps: GE has no direct encoding; operands are swapped and cmpleps
; is used, so the result lands in xmm1 and is copied back.
define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpleps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpge_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpleps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ole <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpge_ss: swapped cmp.ss (LE, imm 2) plus a movss-style shuffle to keep
; the upper three lanes of %a0.
define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpless %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpge_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpless %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

; _mm_cmpgt_ps: GT is the swapped form of LT -> cmpltps with commuted operands.
define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpltps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpgt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpltps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp olt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpgt_ss: swapped cmp.ss (LT, imm 1) plus lane-0 blend back into %a0.
define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpltss %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpgt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpltss %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

; _mm_cmple_ps: fcmp ole -> cmpleps directly (no operand swap needed).
define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpleps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmple_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpleps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ole <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmple_ss: cmp.ss with predicate imm 2 (LE) -> cmpless.
define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpless %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmple_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpless %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
  ret <4 x float> %res
}

; _mm_cmplt_ps: fcmp olt -> cmpltps directly.
define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpltps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmplt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpltps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp olt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmplt_ss: cmp.ss with predicate imm 1 (LT) -> cmpltss.
define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpltss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmplt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpltss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
  ret <4 x float> %res
}

; _mm_cmpneq_ps: fcmp une (unordered-or-not-equal) -> cmpneqps.
define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpneqps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpneq_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpneqps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp une <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpneq_ss: cmp.ss with predicate imm 4 (NEQ) -> cmpneqss.
define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpneqss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpneq_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpneqss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}

; _mm_cmpnge_ps: NOT(a1 <= a0) -> swapped cmpnleps; result copied from xmm1.
define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnleps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnge_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnleps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ugt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpnge_ss: swapped cmp.ss (NLE, imm 6) plus lane-0 blend back into %a0.
define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnless %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnge_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnless %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

; _mm_cmpngt_ps: NOT(a1 < a0) -> swapped cmpnltps; result copied from xmm1.
define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnltps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpngt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnltps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uge <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpngt_ss: swapped cmp.ss (NLT, imm 5) plus lane-0 blend back into %a0.
define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnltss %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpngt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnltss %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

; _mm_cmpnle_ps: fcmp ugt -> cmpnleps directly.
define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnleps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnle_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnleps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ugt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpnle_ss: cmp.ss with predicate imm 6 (NLE) -> cmpnless.
define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnless %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnle_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnless %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
  ret <4 x float> %res
}

; _mm_cmpnlt_ps: fcmp uge -> cmpnltps directly.
define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnltps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnlt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnltps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uge <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpnlt_ss: cmp.ss with predicate imm 5 (NLT) -> cmpnltss.
define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnltss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnlt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnltss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
  ret <4 x float> %res
}

; _mm_cmpord_ps: fcmp ord (neither operand NaN) -> cmpordps.
define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpordps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpord_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpordps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ord <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpord_ss: cmp.ss with predicate imm 7 (ORD) -> cmpordss.
define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpordss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpord_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpordss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}

; _mm_cmpunord_ps: fcmp uno (either operand NaN) -> cmpunordps.
define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpunordps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpunord_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpunordps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uno <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; _mm_cmpunord_ss: cmp.ss with predicate imm 3 (UNORD) -> cmpunordss.
define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpunordss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpunord_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpunordss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
  ret <4 x float> %res
}

; _mm_comieq_ss: comiss + setnp/sete/andb — EQ must also exclude the
; unordered (NaN) case signalled by PF.
define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comieq_ss:
; X32:       # BB#0:
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setnp %al
; X32-NEXT:    sete %cl
; X32-NEXT:    andb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comieq_ss:
; X64:       # BB#0:
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setnp %al
; X64-NEXT:    sete %cl
; X64-NEXT:    andb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_comige_ss: comiss + setae (unsigned-style flags from comiss).
define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comige_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comige_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_comigt_ss: comiss + seta.
define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comigt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comigt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_comile_ss: LE implemented as swapped-operand comiss + setae.
define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comile_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm0, %xmm1
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comile_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm0, %xmm1
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_comilt_ss: LT implemented as swapped-operand comiss + seta.
define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comilt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm0, %xmm1
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comilt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm0, %xmm1
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_comineq_ss: comiss + setp/setne/orb — NEQ is true for unordered too,
; so PF is OR'd in.
define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comineq_ss:
; X32:       # BB#0:
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setp %al
; X32-NEXT:    setne %cl
; X32-NEXT:    orb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comineq_ss:
; X64:       # BB#0:
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setp %al
; X64-NEXT:    setne %cl
; X64-NEXT:    orb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_cvt_ss2si: cvtss2si intrinsic (rounding conversion, not truncating).
define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvt_ss2si:
; X32:       # BB#0:
; X32-NEXT:    cvtss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvt_ss2si:
; X64:       # BB#0:
; X64-NEXT:    cvtss2si %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

; _mm_cvtsi32_ss: sitofp + insert into lane 0 -> cvtsi2ssl and a movss blend.
define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    cvtsi2ssl %eax, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
; X64:       # BB#0:
; X64-NEXT:    cvtsi2ssl %edi, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cvt = sitofp i32 %a1 to float
  %res = insertelement <4 x float> %a0, float %cvt, i32 0
  ret <4 x float> %res
}

; _mm_cvtss_f32: extract lane 0. On x86 the float return goes via the x87
; stack (movss spill + flds); on x86-64 the value is already in xmm0.
define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_f32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtss_f32:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = extractelement <4 x float> %a0, i32 0
  ret float %res
}

; _mm_cvtss_si32: alias of _mm_cvt_ss2si -> cvtss2si.
define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_si32:
; X32:       # BB#0:
; X32-NEXT:    cvtss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtss_si32:
; X64:       # BB#0:
; X64-NEXT:    cvtss2si %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}

; _mm_cvttss_si: plain fptosi of lane 0 -> truncating cvttss2si.
define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si:
; X32:       # BB#0:
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvttss_si:
; X64:       # BB#0:
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    retq
  %cvt = extractelement <4 x float> %a0, i32 0
  %res = fptosi float %cvt to i32
  ret i32 %res
}

; _mm_cvttss_si32: same lowering as _mm_cvttss_si.
define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
; X32:       # BB#0:
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvttss_si32:
; X64:       # BB#0:
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    retq
  %cvt = extractelement <4 x float> %a0, i32 0
  %res = fptosi float %cvt to i32
  ret i32 %res
}

; _mm_div_ps: full-width vector fdiv must select a single divps.
define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ps:
; X32:       # BB#0:
; X32-NEXT:    divps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_div_ps:
; X64:       # BB#0:
; X64-NEXT:    divps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fdiv <4 x float> %a0, %a1
  ret <4 x float> %res
}

; _mm_div_ss: extract lane 0, scalar fdiv, reinsert — must fold to divss.
define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ss:
; X32:       # BB#0:
; X32-NEXT:    divss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_div_ss:
; X64:       # BB#0:
; X64-NEXT:    divss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fdiv = fdiv float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
  ret <4 x float> %res
}

; _MM_GET_EXCEPTION_MASK: stmxcsr to a stack slot, then mask bits 7-12
; (0x1F80) of MXCSR.
define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $8064, %eax # imm = 0x1F80
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $8064, %eax # imm = 0x1F80
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 8064
  ret i32 %4
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

; _MM_GET_EXCEPTION_STATE: stmxcsr, then mask the exception-flag bits 0-5
; (0x3F) of MXCSR.
define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $63, %eax
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $63, %eax
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 63
  ret i32 %4
}

; _MM_GET_FLUSH_ZERO_MODE: stmxcsr, then mask the FTZ bit 15 (0x8000).
define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $32768, %eax # imm = 0x8000
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $32768, %eax # imm = 0x8000
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 32768
  ret i32 %4
}

; _MM_GET_ROUNDING_MODE: stmxcsr, then mask the rounding-control bits 13-14
; (0x6000).
define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X32-LABEL: test_MM_GET_ROUNDING_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $24576, %eax # imm = 0x6000
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_ROUNDING_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $24576, %eax # imm = 0x6000
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 24576
  ret i32 %4
}

; _mm_getcsr: stmxcsr to a stack slot and return the whole register.
define i32 @test_mm_getcsr() nounwind {
; X32-LABEL: test_mm_getcsr:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_getcsr:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  ret i32 %3
}

; Aligned 16-byte vector load (_mm_load_ps); expected to select movaps.
define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movaps (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 16
  ret <4 x float> %res
}
959
; Scalar load broadcast to all four lanes (_mm_load_ps1); expects movss + shufps splat.
define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps1:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ps1:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}
980
; Unaligned scalar load into lane 0 with upper lanes zeroed (_mm_load_ss);
; note the load uses align 1, so movss must tolerate unaligned addresses.
define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ss:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 1
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}
999
; Same splat-load pattern as test_mm_load_ps1; _mm_load1_ps is its alias in the headers.
define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load1_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load1_ps:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}
1020
; Loads 2 floats through an x86_mmx pointer into the high lanes of %a0
; (_mm_loadh_pi). Fast-isel produces a notably poor scalarized sequence here.
define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadh_pi:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadh_pi:
; X64:       # BB#0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}
1050
; Loads 2 floats through an x86_mmx pointer into the low lanes of %a0
; (_mm_loadl_pi), keeping the upper lanes of %a0; scalarized under fast-isel.
define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadl_pi:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadl_pi:
; X64:       # BB#0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %res
}
1082
; Aligned load with element order reversed (_mm_loadr_ps): movaps + shufps [3,2,1,0].
define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadr_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movaps (%eax), %xmm0
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadr_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %ld = load <4 x float>, <4 x float>* %arg0, align 16
  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}
1101
; Unaligned vector load (_mm_loadu_ps, align 1); expected to select movups.
define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadu_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadu_ps:
; X64:       # BB#0:
; X64-NEXT:    movups (%rdi), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 1
  ret <4 x float> %res
}
1117
; _mm_max_ps lowers directly to the llvm.x86.sse.max.ps intrinsic -> maxps.
define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ps:
; X32:       # BB#0:
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_max_ps:
; X64:       # BB#0:
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
1132
; _mm_max_ss lowers directly to the llvm.x86.sse.max.ss intrinsic -> maxss.
define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ss:
; X32:       # BB#0:
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_max_ss:
; X64:       # BB#0:
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
1147
; _mm_min_ps lowers directly to the llvm.x86.sse.min.ps intrinsic -> minps.
define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ps:
; X32:       # BB#0:
; X32-NEXT:    minps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_min_ps:
; X64:       # BB#0:
; X64-NEXT:    minps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
1162
; _mm_min_ss lowers directly to the llvm.x86.sse.min.ss intrinsic -> minss.
define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ss:
; X32:       # BB#0:
; X32-NEXT:    minss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_min_ss:
; X64:       # BB#0:
; X64-NEXT:    minss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
1177
; Shuffle mask <4,1,2,3> selects lane 0 from %a1, rest from %a0 (_mm_move_ss) -> movss.
define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_move_ss:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_move_ss:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}
1191
; Shuffle mask <6,7,2,3>: high half of %a1 into low half of %a0 (_mm_movehl_ps) -> movhlps.
define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movehl_ps:
; X32:       # BB#0:
; X32-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movehl_ps:
; X64:       # BB#0:
; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x float> %res
}
1205
; Shuffle mask <0,1,4,5>: low half of %a1 into high half of %a0 (_mm_movelh_ps) -> movlhps.
define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movelh_ps:
; X32:       # BB#0:
; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movelh_ps:
; X64:       # BB#0:
; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}
1219
; _mm_movemask_ps lowers to the llvm.x86.sse.movmsk.ps intrinsic -> movmskps.
define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_movemask_ps:
; X32:       # BB#0:
; X32-NEXT:    movmskps %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movemask_ps:
; X64:       # BB#0:
; X64-NEXT:    movmskps %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
1234
; Plain vector fmul (_mm_mul_ps) -> mulps.
define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ps:
; X32:       # BB#0:
; X32-NEXT:    mulps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mul_ps:
; X64:       # BB#0:
; X64-NEXT:    mulps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fmul <4 x float> %a0, %a1
  ret <4 x float> %res
}
1248
; Scalar multiply on lane 0, upper lanes passed through from %a0
; (extract/fmul/insert pattern for _mm_mul_ss) -> mulss.
define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ss:
; X32:       # BB#0:
; X32-NEXT:    mulss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mul_ss:
; X64:       # BB#0:
; X64-NEXT:    mulss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fmul = fmul float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fmul, i32 0
  ret <4 x float> %res
}
1265
; Bitwise OR of float vectors via i32 bitcasts (_mm_or_ps). Without SSE2,
; fast-isel cannot select a vector integer OR, so both targets spill the
; vectors to the stack and OR them as scalar GPR words, then rebuild the
; vector with movss/unpcklps - hence the long expected sequences below.
define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_or_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_or_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    orl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    orl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    orl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    orl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = or <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}
1336
; _mm_prefetch with hint _MM_HINT_NTA: llvm.prefetch(rw=0, locality=0, data=1)
; selects prefetchnta.
define void @test_mm_prefetch(i8* %a0) {
; X32-LABEL: test_mm_prefetch:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    prefetchnta (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_prefetch:
; X64:       # BB#0:
; X64-NEXT:    prefetchnta (%rdi)
; X64-NEXT:    retq
  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
  ret void
}
declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
1352
; _mm_rcp_ps lowers to the llvm.x86.sse.rcp.ps intrinsic -> rcpps.
define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rcp_ps:
; X32:       # BB#0:
; X32-NEXT:    rcpps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_rcp_ps:
; X64:       # BB#0:
; X64-NEXT:    rcpps %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
1367
; Scalar reciprocal on lane 0; lanes 1-3 are explicitly rebuilt from %a0 so
; the whole extract/insert chain must fold into a single rcpss.
define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_rcp_ss:
; X32:       # BB#0:
; X32-NEXT:    rcpss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_rcp_ss:
; X64:       # BB#0:
; X64-NEXT:    rcpss %xmm0, %xmm0
; X64-NEXT:    retq
  %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %rcp, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
1390
; _mm_rsqrt_ps lowers to the llvm.x86.sse.rsqrt.ps intrinsic -> rsqrtps.
define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ps:
; X32:       # BB#0:
; X32-NEXT:    rsqrtps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_rsqrt_ps:
; X64:       # BB#0:
; X64-NEXT:    rsqrtps %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
1405
; Scalar reciprocal sqrt on lane 0; lanes 1-3 rebuilt from %a0, and the
; extract/insert chain must fold into a single rsqrtss.
define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ss:
; X32:       # BB#0:
; X32-NEXT:    rsqrtss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_rsqrt_ss:
; X64:       # BB#0:
; X64-NEXT:    rsqrtss %xmm0, %xmm0
; X64-NEXT:    retq
  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %rsqrt, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
1428
; Read-modify-write of MXCSR: clear the exception-mask bits (mask -8065 ==
; 0xFFFFE07F, i.e. ~0x1F80) then OR in %a0 and write back via ldmxcsr.
; Matches the IR clang emits for _MM_SET_EXCEPTION_MASK.
define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_MASK:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-8065, %edx # imm = 0xE07F
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_MASK:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-8065, %ecx # imm = 0xE07F
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -8065
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
1465
; Read-modify-write of MXCSR: clear the exception-state flag bits
; (mask -64 == ~0x3F) then OR in %a0 and write back (_MM_SET_EXCEPTION_STATE).
define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_STATE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-64, %edx
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_STATE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-64, %ecx
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -64
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
1501
; Read-modify-write of MXCSR: clear the flush-to-zero bit
; (mask -32769 == ~0x8000) then OR in %a0 and write back (_MM_SET_FLUSH_ZERO_MODE).
define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-32769, %edx # imm = 0xFFFF7FFF
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-32769, %ecx # imm = 0xFFFF7FFF
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -32769
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
1537
; _mm_set_ps places arguments in reversed order (%a3 in lane 0 ... %a0 in
; lane 3); built with an insertelement chain, selected as unpcklps trees.
define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_set_ps:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set_ps:
; X64:       # BB#0:
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT:    movaps %xmm3, %xmm0
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a3, i32 0
  %res1  = insertelement <4 x float> %res0, float %a2, i32 1
  %res2  = insertelement <4 x float> %res1, float %a1, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}
1563
; Splat a scalar argument to all four lanes (_mm_set_ps1) -> shufps [0,0,0,0];
; the i386 target additionally loads the argument from the stack first.
define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
; X32-LABEL: test_mm_set_ps1:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set_ps1:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}
1581
; Read-modify-write of MXCSR: clear the rounding-control field
; (mask -24577 == ~0x6000) then OR in %a0 and write back (_MM_SET_ROUNDING_MODE).
define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_ROUNDING_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-24577, %edx # imm = 0x9FFF
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_ROUNDING_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-24577, %ecx # imm = 0x9FFF
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -24577
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
1617
; Scalar in lane 0, zeros in lanes 1-3 (_mm_set_ss): xorps to zero, then movss merge.
define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X32-LABEL: test_mm_set_ss:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    xorps %xmm0, %xmm0
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set_ss:
; X64:       # BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}
1638
; Splat a scalar argument to all lanes (_mm_set1_ps); same codegen as _mm_set_ps1.
define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
; X32-LABEL: test_mm_set1_ps:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set1_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}
1656
; Writes %a0 into MXCSR through a stack slot (_mm_setcsr) -> ldmxcsr.
define void @test_mm_setcsr(i32 %a0) nounwind {
; X32-LABEL: test_mm_setcsr:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_setcsr:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %st = alloca i32, align 4
  store i32 %a0, i32* %st, align 4
  %bc = bitcast i32* %st to i8*
  call void @llvm.x86.sse.ldmxcsr(i8* %bc)
  ret void
}
1680
; _mm_setr_ps places arguments in natural order (%a0 in lane 0 ... %a3 in
; lane 3); selected as an unpcklps tree.
define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_setr_ps:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_setr_ps:
; X64:       # BB#0:
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a1, i32 1
  %res2  = insertelement <4 x float> %res1, float %a2, i32 2
  %res3  = insertelement <4 x float> %res2, float %a3, i32 3
  ret <4 x float> %res3
}
1705
; Zero vector (_mm_setzero_ps) -> xorps zeroing idiom.
define <4 x float> @test_mm_setzero_ps() {
; X32-LABEL: test_mm_setzero_ps:
; X32:       # BB#0:
; X32-NEXT:    xorps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_setzero_ps:
; X64:       # BB#0:
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    retq
  ret <4 x float> zeroinitializer
}
1718
; _mm_sfence lowers to the llvm.x86.sse.sfence intrinsic -> sfence.
define void @test_mm_sfence() nounwind {
; X32-LABEL: test_mm_sfence:
; X32:       # BB#0:
; X32-NEXT:    sfence
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sfence:
; X64:       # BB#0:
; X64-NEXT:    sfence
; X64-NEXT:    retq
  call void @llvm.x86.sse.sfence()
  ret void
}
declare void @llvm.x86.sse.sfence() nounwind readnone
1733
; Two-input shuffle with mask <0,0,4,4> (_mm_shuffle_ps with imm8 0) -> one shufps.
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
  ret <4 x float> %res
}
1747
; _mm_sqrt_ps lowers to the llvm.x86.sse.sqrt.ps intrinsic -> sqrtps.
define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ps:
; X32:       # BB#0:
; X32-NEXT:    sqrtps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sqrt_ps:
; X64:       # BB#0:
; X64-NEXT:    sqrtps %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
1762
; Scalar sqrt on lane 0; lanes 1-3 rebuilt from %a0, and the extract/insert
; chain must fold into a single sqrtss.
define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ss:
; X32:       # BB#0:
; X32-NEXT:    sqrtss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sqrt_ss:
; X64:       # BB#0:
; X64-NEXT:    sqrtss %xmm0, %xmm0
; X64-NEXT:    retq
  %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %sqrt, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
1785
; _mm_store_ps: 16-byte-aligned vector store -> movaps.
define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16
  ret void
}
1801
; _mm_store_ps1: splat lane 0 (shufps 0,0,0,0) then aligned store (movaps).
; Same IR/codegen as test_mm_store1_ps below - the two intrinsics are aliases.
define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps1:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store_ps1:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}
1820
; _mm_store_ss: store lane 0 only (align 1) -> movss.
define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store_ss:
; X64:       # BB#0:
; X64-NEXT:    movss %xmm0, (%rdi)
; X64-NEXT:    retq
  %ext = extractelement <4 x float> %a1, i32 0
  store float %ext, float* %a0, align 1
  ret void
}
1836
; _mm_store1_ps: splat lane 0 then aligned store; identical lowering to
; test_mm_store_ps1 above.
define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store1_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store1_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}
1855
; _mm_storeh_pi: store the high 64 bits (i64 lane 1) of the vector.
; With only SSE1 and fast-isel this goes through a stack spill plus integer
; moves rather than a movhps.
define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storeh_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movaps %xmm0, (%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, 4(%eax)
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storeh_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 1
  store i64 %ext, i64* %ptr
  ret void
}
1885
; _mm_storel_pi: store the low 64 bits (i64 lane 0) of the vector.
; Mirrors test_mm_storeh_ps but extracts element 0; likewise spilled to the
; stack under fast-isel instead of using movlps.
define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storel_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movaps %xmm0, (%esp)
; X32-NEXT:    movl (%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, 4(%eax)
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storel_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 0
  store i64 %ext, i64* %ptr
  ret void
}
1915
; _mm_storer_ps: reverse the lanes (shufps 3,2,1,0) then aligned store.
define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storer_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storer_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}
1934
; _mm_storeu_ps: unaligned vector store (align 1) -> movups.
define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storeu_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storeu_ps:
; X64:       # BB#0:
; X64-NEXT:    movups %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 1
  ret void
}
1950
; _mm_stream_ps: aligned store tagged !nontemporal (metadata !0) -> movntps.
define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_stream_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movntps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_stream_ps:
; X64:       # BB#0:
; X64-NEXT:    movntps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
  ret void
}
1966
; _mm_sub_ps: packed fsub -> subps.
define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ps:
; X32:       # BB#0:
; X32-NEXT:    subps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sub_ps:
; X64:       # BB#0:
; X64-NEXT:    subps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fsub <4 x float> %a0, %a1
  ret <4 x float> %res
}
1980
; _mm_sub_ss: scalar fsub on lane 0, reinserted into %a0 -> single subss.
define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ss:
; X32:       # BB#0:
; X32-NEXT:    subss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sub_ss:
; X64:       # BB#0:
; X64-NEXT:    subss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fsub = fsub float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fsub, i32 0
  ret <4 x float> %res
}
1997
; _MM_TRANSPOSE4_PS macro: in-place 4x4 float matrix transpose.
; tmp0/tmp1 interleave rows 0/1 (low/high halves), tmp2/tmp3 interleave rows
; 2/3, then four half-vector shuffles (movlhps/movhlps pattern) produce the
; transposed rows, which are stored back over the inputs.
define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
; X32-LABEL: test_MM_TRANSPOSE4_PS:
; X32:       # BB#0:
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps (%esi), %xmm0
; X32-NEXT:    movaps (%edx), %xmm1
; X32-NEXT:    movaps (%ecx), %xmm2
; X32-NEXT:    movaps (%eax), %xmm3
; X32-NEXT:    movaps %xmm0, %xmm4
; X32-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X32-NEXT:    movaps %xmm2, %xmm5
; X32-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X32-NEXT:    movaps %xmm4, %xmm1
; X32-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X32-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X32-NEXT:    movaps %xmm0, %xmm3
; X32-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X32-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X32-NEXT:    movaps %xmm1, (%esi)
; X32-NEXT:    movaps %xmm5, (%edx)
; X32-NEXT:    movaps %xmm3, (%ecx)
; X32-NEXT:    movaps %xmm2, (%eax)
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_TRANSPOSE4_PS:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    movaps (%rsi), %xmm1
; X64-NEXT:    movaps (%rdx), %xmm2
; X64-NEXT:    movaps (%rcx), %xmm3
; X64-NEXT:    movaps %xmm0, %xmm4
; X64-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X64-NEXT:    movaps %xmm2, %xmm5
; X64-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    movaps %xmm4, %xmm1
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X64-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X64-NEXT:    movaps %xmm0, %xmm3
; X64-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X64-NEXT:    movaps %xmm1, (%rdi)
; X64-NEXT:    movaps %xmm5, (%rsi)
; X64-NEXT:    movaps %xmm3, (%rdx)
; X64-NEXT:    movaps %xmm2, (%rcx)
; X64-NEXT:    retq
  %row0 = load <4 x float>, <4 x float>* %a0, align 16
  %row1 = load <4 x float>, <4 x float>* %a1, align 16
  %row2 = load <4 x float>, <4 x float>* %a2, align 16
  %row3 = load <4 x float>, <4 x float>* %a3, align 16
  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  store <4 x float> %res0, <4 x float>* %a0, align 16
  store <4 x float> %res1, <4 x float>* %a1, align 16
  store <4 x float> %res2, <4 x float>* %a2, align 16
  store <4 x float> %res3, <4 x float>* %a3, align 16
  ret void
}
2070
; _mm_ucomieq_ss: unordered scalar compare-equal. Equality must also check
; PF (NaN => unordered), hence the setnp/sete/andb sequence after ucomiss.
define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomieq_ss:
; X32:       # BB#0:
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setnp %al
; X32-NEXT:    sete %cl
; X32-NEXT:    andb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomieq_ss:
; X64:       # BB#0:
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setnp %al
; X64-NEXT:    sete %cl
; X64-NEXT:    andb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
2093
; _mm_ucomige_ss: scalar >= compare -> ucomiss + setae.
define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomige_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomige_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
2112
; _mm_ucomigt_ss: scalar > compare -> ucomiss + seta.
define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomigt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomigt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
2131
; _mm_ucomile_ss: scalar <= compare. Note the CHECK lines show the operands
; commuted (ucomiss %xmm0, %xmm1) so the unsigned setae condition can be used.
define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomile_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm0, %xmm1
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomile_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm0, %xmm1
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
2150
; _mm_ucomilt_ss: scalar < compare; operands commuted in the CHECK lines
; (ucomiss %xmm0, %xmm1) so seta implements the less-than result.
define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomilt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm0, %xmm1
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomilt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm0, %xmm1
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
2169
; _mm_ucomineq_ss: unordered scalar compare-not-equal. NaN makes the result
; true, hence setp/setne/orb after ucomiss (dual of the ucomieq sequence).
define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomineq_ss:
; X32:       # BB#0:
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setp %al
; X32-NEXT:    setne %cl
; X32-NEXT:    orb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomineq_ss:
; X64:       # BB#0:
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setp %al
; X64-NEXT:    setne %cl
; X64-NEXT:    orb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
2192
; _mm_undefined_ps: returns undef; no instructions should be emitted.
define <4 x float> @test_mm_undefined_ps() {
; X32-LABEL: test_mm_undefined_ps:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_undefined_ps:
; X64:       # BB#0:
; X64-NEXT:    retq
  ret <4 x float> undef
}
2203
; _mm_unpackhi_ps: interleave the high halves (mask <2,6,3,7>) -> unpckhps.
define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpackhi_ps:
; X32:       # BB#0:
; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_unpackhi_ps:
; X64:       # BB#0:
; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %res
}
2217
; _mm_unpacklo_ps: interleave the low halves (mask <0,4,1,5>) -> unpcklps.
define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpacklo_ps:
; X32:       # BB#0:
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_unpacklo_ps:
; X64:       # BB#0:
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %res
}
2231
; _mm_xor_ps: bitwise xor of two float vectors via <4 x i32> bitcasts.
; With SSE2 disabled there is no integer-vector xor, so fast-isel scalarizes:
; both vectors are spilled to the stack, xored as 32-bit GPR chunks, stored
; back, and reassembled with movss + unpcklps. The long CHECK sequences below
; pin that scalarized expansion.
define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_xor_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_xor_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    xorl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    xorl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = xor <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}
2302
; !nontemporal marker metadata referenced by the store in test_mm_stream_ps.
!0 = !{i32 1}
