; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512

; fold (rot (rot x, c1), c2) -> rot x, c1+c2
define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [524288,131072,32768,8192]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; XOP-LABEL: combine_vec_rot_rot:
; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: combine_vec_rot_rot:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_rot_rot:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
  %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 12, i32 13, i32 14, i32 15>
  %5 = shl <4 x i32> %3, <i32 20, i32 19, i32 18, i32 17>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot_splat:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $25, %xmm1
; SSE2-NEXT: pslld $7, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; XOP-LABEL: combine_vec_rot_rot_splat:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: combine_vec_rot_rot_splat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $25, %xmm0, %xmm1
; AVX2-NEXT: vpslld $7, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_rot_rot_splat:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $7, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 22, i32 22, i32 22, i32 22>
  %5 = shl <4 x i32> %3, <i32 10, i32 10, i32 10, i32 10>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

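; the two rotates cancel (the amounts sum to the bit width), so the whole pattern folds away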
define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_rot_rot_splat_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %3 = or <4 x i32> %1, %2
  %4 = lshr <4 x i32> %3, <i32 31, i32 31, i32 31, i32 31>
  %5 = shl <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = or <4 x i32> %4, %5
  ret <4 x i32> %6
}

; TODO - fold (select (icmp eq c, 0), x, (rot x, c)) -> rot x, c
define i32 @combine_rot_select_zero(i32, i32) {
; CHECK-LABEL: combine_rot_select_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: roll %cl, %eax
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: cmovel %edi, %eax
; CHECK-NEXT: retq
  %3 = and i32 %1, 31
  %4 = shl i32 %0, %3
  %5 = sub i32 0, %1
  %6 = and i32 %5, 31
  %7 = lshr i32 %0, %6
  %8 = or i32 %4, %7
  %9 = icmp eq i32 %1, 0
  %10 = select i1 %9, i32 %0, i32 %8
  ret i32 %10
}

define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: combine_vec_rot_select_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pslld $23, %xmm3
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm3
; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; XOP-LABEL: combine_vec_rot_select_zero:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm3
; XOP-NEXT: vpcomeqd %xmm2, %xmm1, %xmm1
; XOP-NEXT: vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: combine_vec_rot_select_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm3
; AVX2-NEXT: vpsllvd %xmm3, %xmm0, %xmm4
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vpsrlvd %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vblendvps %xmm1, %xmm0, %xmm3, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_rot_select_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1}
; AVX512-NEXT: vmovdqa %xmm2, %xmm0
; AVX512-NEXT: retq
  %3 = and <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %4 = shl <4 x i32> %0, %3
  %5 = sub <4 x i32> zeroinitializer, %1
  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %4, %7
  %9 = icmp eq <4 x i32> %1, zeroinitializer
  %10 = select <4 x i1> %9, <4 x i32> %0, <4 x i32> %8
  ret <4 x i32> %10
}

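; variable rotate where both shift amounts are pre-masked with 30; targets with a variable rotate (XOP/AVX512) should still select a single rotate instruction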
define <4 x i32> @rotate_demanded_bits(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; XOP-LABEL: rotate_demanded_bits:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: rotate_demanded_bits:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: rotate_demanded_bits:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %3 = and <4 x i32> %1, <i32 30, i32 30, i32 30, i32 30>
  %4 = shl <4 x i32> %0, %3
  %5 = sub nsw <4 x i32> zeroinitializer, %3
  %6 = and <4 x i32> %5, <i32 30, i32 30, i32 30, i32 30>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %7, %4
  ret <4 x i32> %8
}

define <4 x i32> @rotate_demanded_bits_2(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; XOP-LABEL: rotate_demanded_bits_2:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: rotate_demanded_bits_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [23,23,23,23]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: rotate_demanded_bits_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %3 = and <4 x i32> %1, <i32 23, i32 23, i32 23, i32 23>
  %4 = shl <4 x i32> %0, %3
  %5 = sub nsw <4 x i32> zeroinitializer, %3
  %6 = and <4 x i32> %5, <i32 31, i32 31, i32 31, i32 31>
  %7 = lshr <4 x i32> %0, %6
  %8 = or <4 x i32> %7, %4
  ret <4 x i32> %8
}

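; variant where the rotate amount is known to be even (shifted left by one) before both shift amounts are masked with 30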
define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: rotate_demanded_bits_3:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; XOP-LABEL: rotate_demanded_bits_3:
; XOP: # %bb.0:
; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm1
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX2-LABEL: rotate_demanded_bits_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: rotate_demanded_bits_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %3 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  %4 = and <4 x i32> %3, <i32 30, i32 30, i32 30, i32 30>
  %5 = shl <4 x i32> %0, %4
  %6 = sub <4 x i32> zeroinitializer, %3
  %7 = and <4 x i32> %6, <i32 30, i32 30, i32 30, i32 30>
  %8 = lshr <4 x i32> %0, %7
  %9 = or <4 x i32> %5, %8
  ret <4 x i32> %9
}

; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9935
define i32 @fuzz9935() {
; CHECK-LABEL: fuzz9935:
; CHECK: # %bb.0:
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: retq
  %1 = trunc i40 549755813887 to i32
  %2 = mul i32 %1, %1
  %3 = lshr i32 %2, %1
  %4 = or i32 %3, %2
  ret i32 %4
}