; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

; fold (urem x, 1) -> 0
define i32 @combine_urem_by_one(i32 %x) {
; CHECK-LABEL: combine_urem_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_one:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (urem x, -1) -> select((icmp eq x, -1), 0, x)
define i32 @combine_urem_by_negone(i32 %x) {
; CHECK-LABEL: combine_urem_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-1, %edi
; CHECK-NEXT:    cmovnel %edi, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (urem x, INT_MIN) -> (and x, ~INT_MIN)
define i32 @combine_urem_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_urem_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_minsigned:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_minsigned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; TODO fold (urem x, x) -> 0
define i32 @combine_urem_dupe(i32 %x) {
; CHECK-LABEL: combine_urem_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %edx, %edx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    divl %edi
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrd $1, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    movl %edx, %ecx
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
; SSE-NEXT:    pextrd $2, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    pinsrd $2, %edx, %xmm1
; SSE-NEXT:    pextrd $3, %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %eax
; SSE-NEXT:    pinsrd $3, %edx, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrd $1, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    movl %edx, %ecx
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vmovd %edx, %xmm1
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $2, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %eax
; AVX-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (urem x, pow2) -> (and x, (pow2-1))
define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_pow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}

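; fold (urem x, (shl 1, y)) -> (and x, (add (shl 1, y), -1))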
define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

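; fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))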
define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2d:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrld %xmm5, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm1, %xmm4
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = lshr <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pslld $2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpslld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}