; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux -mattr=+sse2 < %s | FileCheck %s --check-prefixes=LIN,LIN-SSE2
; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefixes=LIN,LIN-SSE4
; RUN: llc -mtriple=x86_64-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=WIN,WIN-SSE2
; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefixes=WIN,WIN-SSE4
; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32
; rdar://7398554

; When doing vector gather-scatter index calculation with 32-bit indices,
; minimize shuffling of each individual element out of the index vector.

; @foo: gather four doubles from %p using 32-bit indices held in a vector.
; The two <4 x i32> vectors at %i and %h are loaded and ANDed lane-wise; each
; lane of the result is then used as a getelementptr index into %p, and the
; four loaded doubles are assembled into the returned <4 x double>.
; On the x86-64 configurations the checks expect every extracted index to be
; sign-extended to 64 bits (cltq/movslq); the i686 configuration has no such
; extension step.
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; LIN-SSE2-LABEL: foo:
; LIN-SSE2:       # %bb.0:
; LIN-SSE2-NEXT:    movdqa (%rsi), %xmm0
; LIN-SSE2-NEXT:    pand (%rdx), %xmm0
; LIN-SSE2-NEXT:    movd %xmm0, %eax
; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; LIN-SSE2-NEXT:    movd %xmm1, %ecx
; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; LIN-SSE2-NEXT:    movd %xmm1, %edx
; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; LIN-SSE2-NEXT:    movd %xmm0, %esi
; LIN-SSE2-NEXT:    cltq
; LIN-SSE2-NEXT:    movslq %ecx, %rcx
; LIN-SSE2-NEXT:    movslq %edx, %rdx
; LIN-SSE2-NEXT:    movslq %esi, %rsi
; LIN-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; LIN-SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; LIN-SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; LIN-SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; LIN-SSE2-NEXT:    retq
;
; LIN-SSE4-LABEL: foo:
; LIN-SSE4:       # %bb.0:
; LIN-SSE4-NEXT:    movdqa (%rsi), %xmm0
; LIN-SSE4-NEXT:    pand (%rdx), %xmm0
; LIN-SSE4-NEXT:    movd %xmm0, %eax
; LIN-SSE4-NEXT:    pextrd $1, %xmm0, %ecx
; LIN-SSE4-NEXT:    pextrd $2, %xmm0, %edx
; LIN-SSE4-NEXT:    pextrd $3, %xmm0, %esi
; LIN-SSE4-NEXT:    cltq
; LIN-SSE4-NEXT:    movslq %ecx, %rcx
; LIN-SSE4-NEXT:    movslq %edx, %rdx
; LIN-SSE4-NEXT:    movslq %esi, %rsi
; LIN-SSE4-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; LIN-SSE4-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; LIN-SSE4-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; LIN-SSE4-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; LIN-SSE4-NEXT:    retq
;
; WIN-SSE2-LABEL: foo:
; WIN-SSE2:       # %bb.0:
; WIN-SSE2-NEXT:    movdqa (%rdx), %xmm0
; WIN-SSE2-NEXT:    pand (%r8), %xmm0
; WIN-SSE2-NEXT:    movd %xmm0, %r8d
; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; WIN-SSE2-NEXT:    movd %xmm1, %r9d
; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; WIN-SSE2-NEXT:    movd %xmm1, %r10d
; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; WIN-SSE2-NEXT:    movd %xmm0, %edx
; WIN-SSE2-NEXT:    movslq %r8d, %r11
; WIN-SSE2-NEXT:    movslq %r9d, %r8
; WIN-SSE2-NEXT:    movslq %r10d, %rax
; WIN-SSE2-NEXT:    movslq %edx, %rdx
; WIN-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; WIN-SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; WIN-SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; WIN-SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; WIN-SSE2-NEXT:    retq
;
; WIN-SSE4-LABEL: foo:
; WIN-SSE4:       # %bb.0:
; WIN-SSE4-NEXT:    movdqa (%rdx), %xmm0
; WIN-SSE4-NEXT:    pand (%r8), %xmm0
; WIN-SSE4-NEXT:    movd %xmm0, %eax
; WIN-SSE4-NEXT:    pextrd $1, %xmm0, %edx
; WIN-SSE4-NEXT:    pextrd $2, %xmm0, %r8d
; WIN-SSE4-NEXT:    pextrd $3, %xmm0, %r9d
; WIN-SSE4-NEXT:    cltq
; WIN-SSE4-NEXT:    movslq %edx, %r10
; WIN-SSE4-NEXT:    movslq %r8d, %rdx
; WIN-SSE4-NEXT:    movslq %r9d, %r8
; WIN-SSE4-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; WIN-SSE4-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; WIN-SSE4-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; WIN-SSE4-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; WIN-SSE4-NEXT:    retq
;
; LIN32-LABEL: foo:
; LIN32:       # %bb.0:
; LIN32-NEXT:    pushl %edi
; LIN32-NEXT:    pushl %esi
; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; LIN32-NEXT:    movdqa (%edx), %xmm0
; LIN32-NEXT:    pand (%ecx), %xmm0
; LIN32-NEXT:    movd %xmm0, %ecx
; LIN32-NEXT:    pextrd $1, %xmm0, %edx
; LIN32-NEXT:    pextrd $2, %xmm0, %esi
; LIN32-NEXT:    pextrd $3, %xmm0, %edi
; LIN32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; LIN32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; LIN32-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; LIN32-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; LIN32-NEXT:    popl %esi
; LIN32-NEXT:    popl %edi
; LIN32-NEXT:    retl
  ; Load both index vectors and combine them lane-wise.
  %a = load <4 x i32>, <4 x i32>* %i
  %b = load <4 x i32>, <4 x i32>* %h
  %j = and <4 x i32> %a, %b
  ; Pull every 32-bit lane out as a scalar index.
  %d0 = extractelement <4 x i32> %j, i32 0
  %d1 = extractelement <4 x i32> %j, i32 1
  %d2 = extractelement <4 x i32> %j, i32 2
  %d3 = extractelement <4 x i32> %j, i32 3
  ; Form the element addresses &p[d0] .. &p[d3] from the i32 indices.
  %q0 = getelementptr double, double* %p, i32 %d0
  %q1 = getelementptr double, double* %p, i32 %d1
  %q2 = getelementptr double, double* %p, i32 %d2
  %q3 = getelementptr double, double* %p, i32 %d3
  ; Four independent scalar loads (the "gather").
  %r0 = load double, double* %q0
  %r1 = load double, double* %q1
  %r2 = load double, double* %q2
  %r3 = load double, double* %q3
  ; Reassemble the loaded doubles into the result vector, lane by lane.
  %v0 = insertelement <4 x double> undef, double %r0, i32 0
  %v1 = insertelement <4 x double> %v0, double %r1, i32 1
  %v2 = insertelement <4 x double> %v1, double %r2, i32 2
  %v3 = insertelement <4 x double> %v2, double %r3, i32 3
  ret <4 x double> %v3
}

; Check that the sequence previously used above, which bounces the vector off the
; cache, works for x86-32. Note that in this case it will not be used for index
; calculation, since indexes are 32-bit, not 64.
;
; @old: extract the four lanes of (load %i) & (load %h), zero-extend each lane
; to i64, AND each with the scalar %f, and return the results as a <4 x i64>.
; In every configuration the checks expect the zext+and of each lane to be
; emitted as a single 32-bit andl, with the results repacked into xmm0/xmm1
; through movd + punpcklqdq.
define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {
; LIN-SSE2-LABEL: old:
; LIN-SSE2:       # %bb.0:
; LIN-SSE2-NEXT:    movdqa (%rsi), %xmm0
; LIN-SSE2-NEXT:    pand (%rdx), %xmm0
; LIN-SSE2-NEXT:    movd %xmm0, %eax
; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; LIN-SSE2-NEXT:    movd %xmm1, %edx
; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; LIN-SSE2-NEXT:    movd %xmm1, %esi
; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; LIN-SSE2-NEXT:    movd %xmm0, %edi
; LIN-SSE2-NEXT:    andl %ecx, %eax
; LIN-SSE2-NEXT:    andl %ecx, %edx
; LIN-SSE2-NEXT:    andl %ecx, %esi
; LIN-SSE2-NEXT:    andl %ecx, %edi
; LIN-SSE2-NEXT:    movd %eax, %xmm0
; LIN-SSE2-NEXT:    movd %edx, %xmm1
; LIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; LIN-SSE2-NEXT:    movd %edi, %xmm2
; LIN-SSE2-NEXT:    movd %esi, %xmm1
; LIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN-SSE2-NEXT:    retq
;
; LIN-SSE4-LABEL: old:
; LIN-SSE4:       # %bb.0:
; LIN-SSE4-NEXT:    movdqa (%rsi), %xmm0
; LIN-SSE4-NEXT:    pand (%rdx), %xmm0
; LIN-SSE4-NEXT:    movd %xmm0, %eax
; LIN-SSE4-NEXT:    pextrd $1, %xmm0, %edx
; LIN-SSE4-NEXT:    pextrd $2, %xmm0, %esi
; LIN-SSE4-NEXT:    pextrd $3, %xmm0, %edi
; LIN-SSE4-NEXT:    andl %ecx, %eax
; LIN-SSE4-NEXT:    andl %ecx, %edx
; LIN-SSE4-NEXT:    andl %ecx, %esi
; LIN-SSE4-NEXT:    andl %ecx, %edi
; LIN-SSE4-NEXT:    movd %edx, %xmm1
; LIN-SSE4-NEXT:    movd %eax, %xmm0
; LIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; LIN-SSE4-NEXT:    movd %edi, %xmm2
; LIN-SSE4-NEXT:    movd %esi, %xmm1
; LIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN-SSE4-NEXT:    retq
;
; WIN-SSE2-LABEL: old:
; WIN-SSE2:       # %bb.0:
; WIN-SSE2-NEXT:    movdqa (%rdx), %xmm0
; WIN-SSE2-NEXT:    pand (%r8), %xmm0
; WIN-SSE2-NEXT:    movd %xmm0, %eax
; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; WIN-SSE2-NEXT:    movd %xmm1, %ecx
; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; WIN-SSE2-NEXT:    movd %xmm1, %r8d
; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; WIN-SSE2-NEXT:    movd %xmm0, %edx
; WIN-SSE2-NEXT:    andl %r9d, %eax
; WIN-SSE2-NEXT:    andl %r9d, %ecx
; WIN-SSE2-NEXT:    andl %r9d, %r8d
; WIN-SSE2-NEXT:    andl %r9d, %edx
; WIN-SSE2-NEXT:    movd %eax, %xmm0
; WIN-SSE2-NEXT:    movd %ecx, %xmm1
; WIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; WIN-SSE2-NEXT:    movd %edx, %xmm2
; WIN-SSE2-NEXT:    movd %r8d, %xmm1
; WIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE2-NEXT:    retq
;
; WIN-SSE4-LABEL: old:
; WIN-SSE4:       # %bb.0:
; WIN-SSE4-NEXT:    movdqa (%rdx), %xmm0
; WIN-SSE4-NEXT:    pand (%r8), %xmm0
; WIN-SSE4-NEXT:    movd %xmm0, %eax
; WIN-SSE4-NEXT:    pextrd $1, %xmm0, %ecx
; WIN-SSE4-NEXT:    pextrd $2, %xmm0, %r8d
; WIN-SSE4-NEXT:    pextrd $3, %xmm0, %edx
; WIN-SSE4-NEXT:    andl %r9d, %eax
; WIN-SSE4-NEXT:    andl %r9d, %ecx
; WIN-SSE4-NEXT:    andl %r9d, %r8d
; WIN-SSE4-NEXT:    andl %r9d, %edx
; WIN-SSE4-NEXT:    movd %ecx, %xmm1
; WIN-SSE4-NEXT:    movd %eax, %xmm0
; WIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; WIN-SSE4-NEXT:    movd %edx, %xmm2
; WIN-SSE4-NEXT:    movd %r8d, %xmm1
; WIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE4-NEXT:    retq
;
; LIN32-LABEL: old:
; LIN32:       # %bb.0:
; LIN32-NEXT:    pushl %edi
; LIN32-NEXT:    pushl %esi
; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; LIN32-NEXT:    movdqa (%edx), %xmm0
; LIN32-NEXT:    pand (%ecx), %xmm0
; LIN32-NEXT:    movd %xmm0, %ecx
; LIN32-NEXT:    pextrd $1, %xmm0, %edx
; LIN32-NEXT:    pextrd $2, %xmm0, %esi
; LIN32-NEXT:    pextrd $3, %xmm0, %edi
; LIN32-NEXT:    andl %eax, %ecx
; LIN32-NEXT:    andl %eax, %edx
; LIN32-NEXT:    andl %eax, %esi
; LIN32-NEXT:    andl %eax, %edi
; LIN32-NEXT:    movd %edx, %xmm1
; LIN32-NEXT:    movd %ecx, %xmm0
; LIN32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; LIN32-NEXT:    movd %edi, %xmm2
; LIN32-NEXT:    movd %esi, %xmm1
; LIN32-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN32-NEXT:    popl %esi
; LIN32-NEXT:    popl %edi
; LIN32-NEXT:    retl
  ; Load both input vectors and combine them lane-wise.
  %a = load <4 x i32>, <4 x i32>* %i
  %b = load <4 x i32>, <4 x i32>* %h
  %j = and <4 x i32> %a, %b
  ; Pull every 32-bit lane out as a scalar.
  %d0 = extractelement <4 x i32> %j, i32 0
  %d1 = extractelement <4 x i32> %j, i32 1
  %d2 = extractelement <4 x i32> %j, i32 2
  %d3 = extractelement <4 x i32> %j, i32 3
  ; Widen each lane to 64 bits with zero extension.
  %q0 = zext i32 %d0 to i64
  %q1 = zext i32 %d1 to i64
  %q2 = zext i32 %d2 to i64
  %q3 = zext i32 %d3 to i64
  ; Mask each widened lane with %f; the upper 32 bits are already zero.
  %r0 = and i64 %q0, %f
  %r1 = and i64 %q1, %f
  %r2 = and i64 %q2, %f
  %r3 = and i64 %q3, %f
  ; Reassemble the masked values into the result vector, lane by lane.
  %v0 = insertelement <4 x i64> undef, i64 %r0, i32 0
  %v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1
  %v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2
  %v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3
  ret <4 x i64> %v3
}
270