; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse -O3 | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+sse -O3 | FileCheck %s --check-prefixes=CHECK,X64

; Tests for SSE1 and below, without SSE2+.

; PR7993
;define <4 x i32> @test3(<4 x i16> %a) nounwind {
;  %c = sext <4 x i16> %a to <4 x i32>             ; <<4 x i32>> [#uses=1]
;  ret <4 x i32> %c
;}

; This should not emit shuffles to populate the top 2 elements of the 4-element
; vector that this ends up returning.
; rdar://8368414
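;
; In scalar terms the IR below is a complex-style add; a C sketch
; (illustrative only, not part of the test):
;   out[0] = A[0] + B[0];   /* real parts added:      addss */
;   out[1] = A[1] - B[1];   /* imag parts subtracted: subss */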
define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movaps %xmm0, %xmm2
; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; CHECK-NEXT:    addss %xmm1, %xmm0
; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-NEXT:    subss %xmm1, %xmm2
; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fsub float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

; We used to get stuck in type legalization for this example when lowering the
; vselect. With SSE1 v4f32 is a legal type but v4i1 (or any vector integer type)
; is not. We used to ping-pong between splitting the vselect for the v4i1
; condition operand and widening the resulting vselect for the v4f32 result.
; PR18036

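; What the function computes, as a C sketch (illustrative only):
;   const float k[4] = {1.0f, 2.0f, 3.0f, 4.0f};
;   for (int i = 0; i < 4; ++i)
;     out[i] = (q[i] == 0) ? k[i] : 0.0f;
; With no SSE1 integer vector compare available, each lane is tested with a
; scalar cmpl/testl and the result is rebuilt with movss/unpcklps/movlhps.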
define <4 x float> @vselect(<4 x float>* %p, <4 x i32> %q) {
; X86-LABEL: vselect:
; X86:       # %bb.0: # %entry
; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    xorps %xmm0, %xmm0
; X86-NEXT:    je .LBB1_1
; X86-NEXT:  # %bb.2: # %entry
; X86-NEXT:    xorps %xmm1, %xmm1
; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    jne .LBB1_5
; X86-NEXT:  .LBB1_4:
; X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    jne .LBB1_8
; X86-NEXT:  .LBB1_7:
; X86-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    je .LBB1_10
; X86-NEXT:    jmp .LBB1_11
; X86-NEXT:  .LBB1_1:
; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    je .LBB1_4
; X86-NEXT:  .LBB1_5: # %entry
; X86-NEXT:    xorps %xmm2, %xmm2
; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    je .LBB1_7
; X86-NEXT:  .LBB1_8: # %entry
; X86-NEXT:    xorps %xmm3, %xmm3
; X86-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    jne .LBB1_11
; X86-NEXT:  .LBB1_10:
; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:  .LBB1_11: # %entry
; X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-NEXT:    retl
;
; X64-LABEL: vselect:
; X64:       # %bb.0: # %entry
; X64-NEXT:    testl %edx, %edx
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    je .LBB1_1
; X64-NEXT:  # %bb.2: # %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    testl %ecx, %ecx
; X64-NEXT:    jne .LBB1_5
; X64-NEXT:  .LBB1_4:
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    testl %r8d, %r8d
; X64-NEXT:    jne .LBB1_8
; X64-NEXT:  .LBB1_7:
; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT:    testl %esi, %esi
; X64-NEXT:    je .LBB1_10
; X64-NEXT:    jmp .LBB1_11
; X64-NEXT:  .LBB1_1:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    testl %ecx, %ecx
; X64-NEXT:    je .LBB1_4
; X64-NEXT:  .LBB1_5: # %entry
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    testl %r8d, %r8d
; X64-NEXT:    je .LBB1_7
; X64-NEXT:  .LBB1_8: # %entry
; X64-NEXT:    xorps %xmm3, %xmm3
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT:    testl %esi, %esi
; X64-NEXT:    jne .LBB1_11
; X64-NEXT:  .LBB1_10:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:  .LBB1_11: # %entry
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT:    retq
entry:
  %a1 = icmp eq <4 x i32> %q, zeroinitializer
  %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> zeroinitializer
  ret <4 x float> %a14
}

; v4i32 isn't legal for SSE1, but this should still compile to a single cmpps:
; the fcmp already yields an all-ones/all-zeros mask per lane, so the sext and
; bitcast below fold away.

define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: PR28044:
; CHECK:       # %bb.0:
; CHECK-NEXT:    cmpeqps %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; Don't crash trying to do the impossible: SSE1 has no integer vector compare,
; so we must scalarize.
; https://llvm.org/bugs/show_bug.cgi?id=30512

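; A per-lane C sketch of the scalarized lowering checked below
; (illustrative only):
;   int r = -(x[i] == y[i]);   /* cmpl + sete + negl -> 0 or -1 */
;   spill[i] = r;              /* movl to a stack slot          */
; The lanes are then reloaded with movss/unpcklps/movlhps and masked with
; andps against a splat-1 constant-pool value, turning the 0/-1 lanes into
; the 0/1 result the zext requires.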
define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-LABEL: PR30512:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    sete %bl
; X86-NEXT:    negl %ebx
; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    sete %bl
; X86-NEXT:    negl %ebx
; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    sete %bl
; X86-NEXT:    negl %ebx
; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    sete %dl
; X86-NEXT:    negl %edx
; X86-NEXT:    movl %edx, (%esp)
; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X86-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X86-NEXT:    andps {{\.LCPI.*}}, %xmm2
; X86-NEXT:    movaps %xmm2, (%eax)
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl $4
;
; X64-LABEL: PR30512:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    xorl %edi, %edi
; X64-NEXT:    cmpl {{[0-9]+}}(%rsp), %r8d
; X64-NEXT:    sete %dil
; X64-NEXT:    negl %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %edi, %edi
; X64-NEXT:    cmpl {{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    sete %dil
; X64-NEXT:    negl %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %ecx, %ecx
; X64-NEXT:    cmpl {{[0-9]+}}(%rsp), %edx
; X64-NEXT:    sete %cl
; X64-NEXT:    negl %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %ecx, %ecx
; X64-NEXT:    cmpl %r9d, %esi
; X64-NEXT:    sete %cl
; X64-NEXT:    negl %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X64-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X64-NEXT:    andps {{.*}}(%rip), %xmm2
; X64-NEXT:    movaps %xmm2, (%rax)
; X64-NEXT:    retq
  %cmp = icmp eq <4 x i32> %x, %y
  %zext = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %zext
}

; Fragile test warning - we need to induce the generation of a vselect
; post-legalization to cause the crash seen in:
; https://llvm.org/bugs/show_bug.cgi?id=31672
; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
;
; We no longer try to lower sqrt via rsqrt with SSE1 only, since the v4i32
; vselect mentioned above should never have been created; we ended up
; scalarizing it anyway.

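; For context, the rsqrt-based expansion referred to above approximates
; sqrt(x) as x * rsqrt(x) with one Newton-Raphson refinement step; an assumed
; C sketch of that old lowering (not taken from this file):
;   float y = rsqrt(x);                  /* hardware estimate (rsqrtps)   */
;   y = y * (1.5f - 0.5f * x * y * y);   /* one Newton-Raphson iteration  */
;   r = x * y;                           /* sqrt(x) ~= x * rsqrt(x)       */
; plus a select forcing lanes where x == 0.0 to 0.0 (avoiding 0 * inf);
; that fix-up select is the vselect that used to crash here.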
define <2 x float> @PR31672() #0 {
; X86-LABEL: PR31672:
; X86:       # %bb.0:
; X86-NEXT:    sqrtps {{\.LCPI.*}}, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: PR31672:
; X64:       # %bb.0:
; X64-NEXT:    sqrtps {{.*}}(%rip), %xmm0
; X64-NEXT:    retq
  %t0 = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> <float 42.0, float 3.0>)
  ret <2 x float> %t0
}

declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1

attributes #0 = { nounwind "unsafe-fp-math"="true" }