; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512

; fold (rot (rot x, c1), c2) -> rot x, c1+c2
define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [524288,131072,32768,8192]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: combine_vec_rot_rot:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: combine_vec_rot_rot:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_rot_rot:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
  %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 12, i32 13, i32 14, i32 15>
  %5 = shl <4 x i32> %3, <i32 20, i32 19, i32 18, i32 17>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

; Splat amounts: rot(rot(x,29),10) -> rot x,7 (39 mod 32), matching vprold $7.
define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot_splat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $25, %xmm1
; SSE2-NEXT:    pslld $7, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: combine_vec_rot_rot_splat:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: combine_vec_rot_rot_splat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $25, %xmm0, %xmm1
; AVX2-NEXT:    vpslld $7, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_rot_rot_splat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprold $7, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 22, i32 22, i32 22, i32 22>
  %5 = shl <4 x i32> %3, <i32 10, i32 10, i32 10, i32 10>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

; Rotate amounts sum to the bit width (31 + 1 = 32), so the whole thing folds
; away to a no-op.
define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_rot_rot_splat_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 31, i32 31, i32 31, i32 31>
  %5 = shl <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

; TODO - fold (select (icmp eq c, 0), x, (rot x, c)) -> rot x, c
define i32 @combine_rot_select_zero(i32, i32) {
; CHECK-LABEL: combine_rot_select_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %esi, %ecx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    roll %cl, %eax
; CHECK-NEXT:    testl %esi, %esi
; CHECK-NEXT:    cmovel %edi, %eax
; CHECK-NEXT:    retq
  %3 = and i32 %1, 31
  %4 = shl i32 %0, %3
  %5 = sub i32 0, %1
  %6 = and i32 %5, 31
  %7 = lshr i32 %0, %6
  %8 = or i32 %4, %7
  %9 = icmp eq i32 %1, 0
  %10 = select i1 %9, i32 %0, i32 %8
  ret i32 %10
}

; Vector version of combine_rot_select_zero.
define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: combine_vec_rot_select_zero:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [31,31,31,31]
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    pslld $23, %xmm3
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm3
; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    por %xmm5, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pandn %xmm4, %xmm2
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: combine_vec_rot_select_zero:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm3
; XOP-NEXT:    vpcomeqd %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: combine_vec_rot_select_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm3
; AVX2-NEXT:    vpsllvd %xmm3, %xmm0, %xmm4
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm3, %xmm5, %xmm3
; AVX2-NEXT:    vpsrlvd %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_rot_select_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vptestnmd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm2 {%k1}
; AVX512-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512-NEXT:    retq
  %3 = and <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %4 = shl <4 x i32> %0, %3
  %5 = sub <4 x i32> zeroinitializer, %1
  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %4, %7
  %9 = icmp eq <4 x i32> %1, zeroinitializer
  %10 = select <4 x i1> %9, <4 x i32> %0, <4 x i32> %8
  ret <4 x i32> %10
}

; Both shift amounts are pre-masked with 30 - still recognized as a rotate.
define <4 x i32> @rotate_demanded_bits(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotate_demanded_bits:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotate_demanded_bits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotate_demanded_bits:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = and <4 x i32> %1, <i32 30, i32 30, i32 30, i32 30>
  %4 = shl <4 x i32> %0, %3
  %5 = sub nsw <4 x i32> zeroinitializer, %3
  %6 = and <4 x i32> %5, <i32 30, i32 30, i32 30, i32 30>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %7, %4
  ret <4 x i32> %8
}

; Shift amount masked with 23 on the shl side and 31 on the lshr side - still
; recognized as a rotate.
define <4 x i32> @rotate_demanded_bits_2(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotate_demanded_bits_2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotate_demanded_bits_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [23,23,23,23]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotate_demanded_bits_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = and <4 x i32> %1, <i32 23, i32 23, i32 23, i32 23>
  %4 = shl <4 x i32> %0, %3
  %5 = sub nsw <4 x i32> zeroinitializer, %3
  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %7, %4
  ret <4 x i32> %8
}

; Amount is (c << 1) masked with 30; the masks on both paths can be dropped
; since a doubled amount already has bit 0 clear.
define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; XOP-LABEL: rotate_demanded_bits_3:
; XOP:       # %bb.0:
; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX2-LABEL: rotate_demanded_bits_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: rotate_demanded_bits_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %3 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  %4 = and <4 x i32> %3, <i32 30, i32 30, i32 30, i32 30>
  %5 = shl <4 x i32> %0, %4
  %6 = sub <4 x i32> zeroinitializer, %3
  %7 = and <4 x i32> %6, <i32 30, i32 30, i32 30, i32 30>
  %8 = lshr <4 x i32> %0, %7
  %9 = or <4 x i32> %5, %8
  ret <4 x i32> %9
}

; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9935
define i32 @fuzz9935() {
; CHECK-LABEL: fuzz9935:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $-1, %eax
; CHECK-NEXT:    retq
  %1 = trunc i40 549755813887 to i32
  %2 = mul i32 %1, %1
  %3 = lshr i32 %2, %1
  %4 = or i32 %3, %2
  ret i32 %4
}