; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

; fold (urem x, 1) -> 0
define i32 @combine_urem_by_one(i32 %x) {
; CHECK-LABEL: combine_urem_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_one:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (urem x, -1) -> select((icmp eq x, -1), 0, x)
define i32 @combine_urem_by_negone(i32 %x) {
; CHECK-LABEL: combine_urem_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-1, %edi
; CHECK-NEXT:    cmovnel %edi, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (urem x, INT_MIN) -> (and x, ~INT_MIN)
define i32 @combine_urem_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_urem_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
; CHECK-NEXT:    retq
  %1 = urem i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_minsigned:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_minsigned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; fold (urem 0, x) -> 0
define i32 @combine_urem_zero(i32 %x) {
; CHECK-LABEL: combine_urem_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (urem x, x) -> 0
define i32 @combine_urem_dupe(i32 %x) {
; CHECK-LABEL: combine_urem_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = urem i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_urem_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (urem x, pow2) -> (and x, (pow2-1))
define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_urem_by_pow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2d:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psrld %xmm5, %xmm6
; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrld %xmm1, %xmm4
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = lshr <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pslld $2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpslld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
; AVX2-NEXT:    vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = urem <4 x i32> %x, %1
  ret <4 x i32> %2
}

define i1 @bool_urem(i1 %x, i1 %y) {
; CHECK-LABEL: bool_urem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %r = urem i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_urem(<4 x i1> %x, <4 x i1> %y) {
; SSE-LABEL: boolvec_urem:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: boolvec_urem:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = urem <4 x i1> %x, %y
  ret <4 x i1> %r
}