; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP

; fold (sdiv x, 1) -> x
define i32 @combine_sdiv_by_one(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (sdiv x, -1) -> 0 - x
define i32 @combine_sdiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
define i32 @combine_sdiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-2147483648, %edi # imm = 0x80000000
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX512F-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm0, %k1
; AVX512BW-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_minsigned:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomeqd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; fold (sdiv 0, x) -> 0
define i32 @combine_sdiv_zero(i32 %x) {
; CHECK-LABEL: combine_sdiv_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sdiv x, x) -> 1
define i32 @combine_sdiv_dupe(i32 %x) {
; CHECK-LABEL: combine_sdiv_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $1, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_dupe:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_dupe:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pos0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pos1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $3, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pos1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    psrld $4, %xmm0
; SSE41-NEXT:    psrld $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pos1:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %2
}

; fold (sdiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psrld $30, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psrld $30, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
  ret <4 x i32> %1
}

define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
; SSE2-NEXT:    pmullw %xmm4, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    pmullw %xmm4, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm3,
%xmm2 305; SSE2-NEXT: paddb %xmm0, %xmm2 306; SSE2-NEXT: movdqa %xmm2, %xmm1 307; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 308; SSE2-NEXT: psraw $8, %xmm1 309; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128] 310; SSE2-NEXT: pmullw %xmm3, %xmm1 311; SSE2-NEXT: psrlw $8, %xmm1 312; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 313; SSE2-NEXT: psraw $8, %xmm2 314; SSE2-NEXT: pmullw %xmm3, %xmm2 315; SSE2-NEXT: psrlw $8, %xmm2 316; SSE2-NEXT: packuswb %xmm1, %xmm2 317; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 318; SSE2-NEXT: pand %xmm1, %xmm2 319; SSE2-NEXT: pandn %xmm0, %xmm1 320; SSE2-NEXT: por %xmm2, %xmm1 321; SSE2-NEXT: movdqa %xmm1, %xmm0 322; SSE2-NEXT: retq 323; 324; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 325; SSE41: # %bb.0: 326; SSE41-NEXT: movdqa %xmm0, %xmm1 327; SSE41-NEXT: pxor %xmm0, %xmm0 328; SSE41-NEXT: pxor %xmm3, %xmm3 329; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 330; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 331; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 332; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2] 333; SSE41-NEXT: pmullw %xmm0, %xmm3 334; SSE41-NEXT: psrlw $8, %xmm3 335; SSE41-NEXT: pmullw %xmm0, %xmm2 336; SSE41-NEXT: psrlw $8, %xmm2 337; SSE41-NEXT: packuswb %xmm3, %xmm2 338; SSE41-NEXT: paddb %xmm1, %xmm2 339; SSE41-NEXT: movdqa %xmm2, %xmm0 340; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 341; SSE41-NEXT: psraw $8, %xmm0 342; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128] 343; SSE41-NEXT: pmullw %xmm3, %xmm0 344; SSE41-NEXT: psrlw $8, %xmm0 345; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 346; SSE41-NEXT: psraw $8, %xmm2 347; SSE41-NEXT: pmullw %xmm3, %xmm2 348; SSE41-NEXT: psrlw $8, %xmm2 349; SSE41-NEXT: packuswb %xmm0, %xmm2 350; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 351; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 352; SSE41-NEXT: movdqa %xmm1, %xmm0 353; SSE41-NEXT: retq 354; 355; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 356; AVX1: # %bb.0: 357; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 358; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 359; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 360; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2] 361; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 362; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 363; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 364; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 365; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 366; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 367; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 368; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 369; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 370; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[256,64,128,16,32,8,4,128] 371; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 372; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 373; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 374; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 375; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 376; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 377; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 378; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 379; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 380; AVX1-NEXT: retq 381; 382; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 383; AVX2: # %bb.0: 384; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 385; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 386; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 387; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 388; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 389; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 390; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 391; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 392; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 393; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 394; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 395; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 396; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 397; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 398; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 399; AVX2-NEXT: vzeroupper 400; AVX2-NEXT: retq 401; 402; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 403; AVX512F: # %bb.0: 404; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 405; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 406; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 407; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 408; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 409; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1 410; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 411; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 412; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 413; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 414; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 415; AVX512F-NEXT: vzeroupper 416; AVX512F-NEXT: retq 417; 418; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 419; AVX512BW: # %bb.0: 420; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 421; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 422; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 423; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1 424; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 425; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1 426; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 427; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm1, %ymm1 428; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 429; AVX512BW-NEXT: movw $257, %ax # imm = 0x101 430; AVX512BW-NEXT: kmovd %eax, %k1 431; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 432; 
AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 433; AVX512BW-NEXT: vzeroupper 434; AVX512BW-NEXT: retq 435; 436; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8: 437; XOP: # %bb.0: 438; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 439; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 440; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1 441; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 442; XOP-NEXT: vpshab {{.*}}(%rip), %xmm1, %xmm1 443; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] 444; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 445; XOP-NEXT: retq 446 %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2> 447 ret <16 x i8> %1 448} 449 450define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { 451; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 452; SSE2: # %bb.0: 453; SSE2-NEXT: movdqa %xmm0, %xmm1 454; SSE2-NEXT: psraw $15, %xmm1 455; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 456; SSE2-NEXT: paddw %xmm0, %xmm1 457; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535] 458; SSE2-NEXT: movdqa %xmm1, %xmm3 459; SSE2-NEXT: pand %xmm2, %xmm3 460; SSE2-NEXT: psraw $4, %xmm1 461; SSE2-NEXT: pandn %xmm1, %xmm2 462; SSE2-NEXT: por %xmm3, %xmm2 463; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535] 464; SSE2-NEXT: movdqa %xmm2, %xmm3 465; SSE2-NEXT: pand %xmm1, %xmm3 466; SSE2-NEXT: psraw $2, %xmm2 467; SSE2-NEXT: pandn %xmm2, %xmm1 468; SSE2-NEXT: por %xmm3, %xmm1 469; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0] 470; SSE2-NEXT: movdqa %xmm1, %xmm3 471; SSE2-NEXT: pand %xmm2, %xmm3 472; SSE2-NEXT: psraw $1, %xmm1 473; SSE2-NEXT: pandn %xmm1, %xmm2 474; SSE2-NEXT: por %xmm3, %xmm2 475; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] 476; SSE2-NEXT: pand %xmm1, %xmm2 477; SSE2-NEXT: pandn %xmm0, %xmm1 478; SSE2-NEXT: por %xmm2, %xmm1 479; SSE2-NEXT: movdqa %xmm1, %xmm0 480; SSE2-NEXT: retq 481; 482; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 483; SSE41: # %bb.0: 484; SSE41-NEXT: movdqa %xmm0, %xmm1 485; SSE41-NEXT: psraw $15, %xmm1 486; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1 487; SSE41-NEXT: paddw %xmm0, %xmm1 488; SSE41-NEXT: movdqa %xmm1, %xmm2 489; SSE41-NEXT: psraw $1, %xmm2 490; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm1 491; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] 492; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] 493; SSE41-NEXT: movdqa %xmm1, %xmm0 494; SSE41-NEXT: retq 495; 496; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 497; AVX1: # %bb.0: 498; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1 499; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 500; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 501; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 502; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1 503; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] 504; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 505; AVX1-NEXT: retq 506; 507; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 508; AVX2: # %bb.0: 509; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1 510; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 511; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 512; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2 513; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1 514; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] 515; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 516; AVX2-NEXT: retq 517; 518; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 519; 
AVX512F: # %bb.0: 520; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1 521; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 522; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1 523; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1 524; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1 525; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 526; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 527; AVX512F-NEXT: vzeroupper 528; AVX512F-NEXT: retq 529; 530; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 531; AVX512BW: # %bb.0: 532; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1 533; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1 534; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1 535; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm1, %xmm1 536; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 537; AVX512BW-NEXT: retq 538; 539; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16: 540; XOP: # %bb.0: 541; XOP-NEXT: vpsraw $15, %xmm0, %xmm1 542; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1 543; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1 544; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm1, %xmm1 545; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] 546; XOP-NEXT: retq 547 %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 548 ret <8 x i16> %1 549} 550 551define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { 552; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 553; SSE2: # %bb.0: 554; SSE2-NEXT: movdqa %xmm0, %xmm3 555; SSE2-NEXT: psraw $15, %xmm0 556; SSE2-NEXT: movdqa {{.*#+}} xmm8 = <u,4,2,16,8,32,64,2> 557; SSE2-NEXT: pmulhuw %xmm8, %xmm0 558; SSE2-NEXT: paddw %xmm3, %xmm0 559; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535] 560; SSE2-NEXT: movdqa %xmm0, %xmm2 561; SSE2-NEXT: pand %xmm4, %xmm2 562; SSE2-NEXT: psraw $4, %xmm0 563; SSE2-NEXT: movdqa %xmm4, %xmm6 564; SSE2-NEXT: pandn %xmm0, %xmm6 565; SSE2-NEXT: por %xmm2, %xmm6 566; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535] 567; SSE2-NEXT: movdqa %xmm6, %xmm0 568; SSE2-NEXT: pand %xmm5, %xmm0 569; SSE2-NEXT: psraw $2, %xmm6 570; SSE2-NEXT: movdqa %xmm5, %xmm2 571; SSE2-NEXT: pandn %xmm6, %xmm2 572; SSE2-NEXT: por %xmm0, %xmm2 573; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0] 574; SSE2-NEXT: movdqa %xmm2, %xmm0 575; SSE2-NEXT: pand %xmm7, %xmm0 576; SSE2-NEXT: psraw $1, %xmm2 577; SSE2-NEXT: movdqa %xmm7, %xmm6 578; SSE2-NEXT: pandn %xmm2, %xmm6 579; SSE2-NEXT: por %xmm0, %xmm6 580; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 581; SSE2-NEXT: pand %xmm2, %xmm6 582; SSE2-NEXT: movdqa %xmm2, %xmm0 583; SSE2-NEXT: pandn %xmm3, %xmm0 584; SSE2-NEXT: por %xmm6, %xmm0 585; SSE2-NEXT: movdqa %xmm1, %xmm3 586; SSE2-NEXT: psraw $15, %xmm3 587; SSE2-NEXT: pmulhuw %xmm8, %xmm3 588; SSE2-NEXT: paddw %xmm1, %xmm3 589; SSE2-NEXT: movdqa %xmm3, %xmm6 590; SSE2-NEXT: pand %xmm4, %xmm6 591; SSE2-NEXT: psraw $4, %xmm3 592; SSE2-NEXT: pandn %xmm3, %xmm4 593; SSE2-NEXT: por %xmm6, %xmm4 594; SSE2-NEXT: movdqa %xmm4, %xmm3 595; SSE2-NEXT: pand %xmm5, %xmm3 596; SSE2-NEXT: psraw $2, %xmm4 597; SSE2-NEXT: pandn %xmm4, %xmm5 598; SSE2-NEXT: por %xmm3, %xmm5 599; SSE2-NEXT: movdqa %xmm5, %xmm3 600; SSE2-NEXT: pand %xmm7, %xmm3 601; SSE2-NEXT: psraw $1, %xmm5 602; SSE2-NEXT: pandn %xmm5, %xmm7 603; SSE2-NEXT: por %xmm3, %xmm7 604; SSE2-NEXT: pand %xmm2, %xmm7 605; SSE2-NEXT: pandn %xmm1, %xmm2 606; SSE2-NEXT: por %xmm7, %xmm2 607; SSE2-NEXT: movdqa %xmm2, %xmm1 608; SSE2-NEXT: retq 609; 610; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 611; 
SSE41: # %bb.0: 612; SSE41-NEXT: movdqa %xmm0, %xmm2 613; SSE41-NEXT: psraw $15, %xmm2 614; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2> 615; SSE41-NEXT: pmulhuw %xmm4, %xmm2 616; SSE41-NEXT: paddw %xmm0, %xmm2 617; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768> 618; SSE41-NEXT: movdqa %xmm2, %xmm3 619; SSE41-NEXT: pmulhw %xmm5, %xmm3 620; SSE41-NEXT: psraw $1, %xmm2 621; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7] 622; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] 623; SSE41-NEXT: movdqa %xmm1, %xmm3 624; SSE41-NEXT: psraw $15, %xmm3 625; SSE41-NEXT: pmulhuw %xmm4, %xmm3 626; SSE41-NEXT: paddw %xmm1, %xmm3 627; SSE41-NEXT: pmulhw %xmm3, %xmm5 628; SSE41-NEXT: psraw $1, %xmm3 629; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] 630; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7] 631; SSE41-NEXT: movdqa %xmm2, %xmm0 632; SSE41-NEXT: movdqa %xmm3, %xmm1 633; SSE41-NEXT: retq 634; 635; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 636; AVX1: # %bb.0: 637; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 638; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2 639; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2> 640; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 641; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 642; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768> 643; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4 644; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 645; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7] 646; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4 647; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3 648; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3 649; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2 650; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 651; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] 652; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 653; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 654; AVX1-NEXT: # ymm2 = mem[0,1,0,1] 655; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 656; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 657; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 658; AVX1-NEXT: retq 659; 660; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 661; AVX2: # %bb.0: 662; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1 663; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1 664; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 665; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2 666; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm1, %ymm1 667; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15] 668; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 669; AVX2-NEXT: retq 670; 671; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 672; AVX512F: # %bb.0: 673; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1 674; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1 675; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 676; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 677; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 678; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 679; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 680; AVX512F-NEXT: retq 681; 682; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 683; AVX512BW: # %bb.0: 684; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1 685; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1 686; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1 687; AVX512BW-NEXT: 
vpsravw {{.*}}(%rip), %ymm1, %ymm1 688; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] 689; AVX512BW-NEXT: retq 690; 691; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16: 692; XOP: # %bb.0: 693; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 694; XOP-NEXT: vpsraw $15, %xmm1, %xmm2 695; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521> 696; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2 697; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1 698; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535> 699; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1 700; XOP-NEXT: vpsraw $15, %xmm0, %xmm4 701; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3 702; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3 703; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2 704; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 705; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 706; XOP-NEXT: # ymm2 = mem[0,1,0,1] 707; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 708; XOP-NEXT: retq 709 %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 710 ret <16 x i16> %1 711} 712 713define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { 714; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 715; SSE2: # %bb.0: 716; SSE2-NEXT: movdqa %xmm1, %xmm8 717; SSE2-NEXT: movdqa %xmm0, %xmm1 718; SSE2-NEXT: psraw $15, %xmm0 719; SSE2-NEXT: movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2> 720; SSE2-NEXT: pmulhuw %xmm9, %xmm0 721; SSE2-NEXT: paddw %xmm1, %xmm0 722; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535] 723; SSE2-NEXT: movdqa %xmm0, %xmm4 724; SSE2-NEXT: pand %xmm11, %xmm4 725; SSE2-NEXT: psraw $4, %xmm0 726; SSE2-NEXT: movdqa %xmm11, %xmm5 727; SSE2-NEXT: pandn %xmm0, %xmm5 728; SSE2-NEXT: por %xmm4, %xmm5 729; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535] 730; SSE2-NEXT: movdqa %xmm5, %xmm0 731; SSE2-NEXT: pand %xmm7, %xmm0 732; SSE2-NEXT: psraw $2, %xmm5 733; SSE2-NEXT: movdqa %xmm7, %xmm4 734; SSE2-NEXT: pandn %xmm5, %xmm4 735; SSE2-NEXT: por %xmm0, %xmm4 736; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0] 737; SSE2-NEXT: movdqa %xmm4, %xmm0 738; SSE2-NEXT: pand %xmm10, %xmm0 739; SSE2-NEXT: psraw $1, %xmm4 740; SSE2-NEXT: movdqa %xmm10, %xmm5 741; SSE2-NEXT: pandn %xmm4, %xmm5 742; SSE2-NEXT: por %xmm0, %xmm5 743; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535] 744; SSE2-NEXT: pand %xmm12, %xmm5 745; SSE2-NEXT: movdqa %xmm12, %xmm0 746; SSE2-NEXT: pandn %xmm1, %xmm0 747; SSE2-NEXT: por %xmm5, %xmm0 748; SSE2-NEXT: movdqa %xmm8, %xmm1 749; SSE2-NEXT: psraw $15, %xmm1 750; SSE2-NEXT: pmulhuw %xmm9, %xmm1 751; SSE2-NEXT: paddw %xmm8, %xmm1 752; SSE2-NEXT: movdqa %xmm1, %xmm5 753; SSE2-NEXT: pand %xmm11, %xmm5 754; SSE2-NEXT: psraw $4, %xmm1 755; SSE2-NEXT: movdqa %xmm11, %xmm6 756; SSE2-NEXT: pandn %xmm1, %xmm6 757; SSE2-NEXT: por %xmm5, %xmm6 758; SSE2-NEXT: movdqa %xmm6, %xmm1 759; SSE2-NEXT: pand %xmm7, %xmm1 760; SSE2-NEXT: psraw $2, %xmm6 761; SSE2-NEXT: movdqa %xmm7, %xmm5 762; SSE2-NEXT: pandn %xmm6, %xmm5 763; SSE2-NEXT: por %xmm1, %xmm5 764; SSE2-NEXT: movdqa %xmm5, %xmm1 765; SSE2-NEXT: pand %xmm10, %xmm1 766; SSE2-NEXT: psraw $1, %xmm5 767; SSE2-NEXT: movdqa %xmm10, %xmm6 768; SSE2-NEXT: pandn %xmm5, %xmm6 769; SSE2-NEXT: por %xmm1, %xmm6 770; SSE2-NEXT: pand %xmm12, %xmm6 771; SSE2-NEXT: movdqa %xmm12, %xmm1 
772; SSE2-NEXT: pandn %xmm8, %xmm1 773; SSE2-NEXT: por %xmm6, %xmm1 774; SSE2-NEXT: movdqa %xmm2, %xmm5 775; SSE2-NEXT: psraw $15, %xmm5 776; SSE2-NEXT: pmulhuw %xmm9, %xmm5 777; SSE2-NEXT: paddw %xmm2, %xmm5 778; SSE2-NEXT: movdqa %xmm5, %xmm6 779; SSE2-NEXT: pand %xmm11, %xmm6 780; SSE2-NEXT: psraw $4, %xmm5 781; SSE2-NEXT: movdqa %xmm11, %xmm4 782; SSE2-NEXT: pandn %xmm5, %xmm4 783; SSE2-NEXT: por %xmm6, %xmm4 784; SSE2-NEXT: movdqa %xmm4, %xmm5 785; SSE2-NEXT: pand %xmm7, %xmm5 786; SSE2-NEXT: psraw $2, %xmm4 787; SSE2-NEXT: movdqa %xmm7, %xmm6 788; SSE2-NEXT: pandn %xmm4, %xmm6 789; SSE2-NEXT: por %xmm5, %xmm6 790; SSE2-NEXT: movdqa %xmm6, %xmm4 791; SSE2-NEXT: pand %xmm10, %xmm4 792; SSE2-NEXT: psraw $1, %xmm6 793; SSE2-NEXT: movdqa %xmm10, %xmm5 794; SSE2-NEXT: pandn %xmm6, %xmm5 795; SSE2-NEXT: por %xmm4, %xmm5 796; SSE2-NEXT: pand %xmm12, %xmm5 797; SSE2-NEXT: movdqa %xmm12, %xmm8 798; SSE2-NEXT: pandn %xmm2, %xmm8 799; SSE2-NEXT: por %xmm5, %xmm8 800; SSE2-NEXT: movdqa %xmm3, %xmm2 801; SSE2-NEXT: psraw $15, %xmm2 802; SSE2-NEXT: pmulhuw %xmm9, %xmm2 803; SSE2-NEXT: paddw %xmm3, %xmm2 804; SSE2-NEXT: movdqa %xmm2, %xmm4 805; SSE2-NEXT: pand %xmm11, %xmm4 806; SSE2-NEXT: psraw $4, %xmm2 807; SSE2-NEXT: pandn %xmm2, %xmm11 808; SSE2-NEXT: por %xmm4, %xmm11 809; SSE2-NEXT: movdqa %xmm11, %xmm2 810; SSE2-NEXT: pand %xmm7, %xmm2 811; SSE2-NEXT: psraw $2, %xmm11 812; SSE2-NEXT: pandn %xmm11, %xmm7 813; SSE2-NEXT: por %xmm2, %xmm7 814; SSE2-NEXT: movdqa %xmm7, %xmm2 815; SSE2-NEXT: pand %xmm10, %xmm2 816; SSE2-NEXT: psraw $1, %xmm7 817; SSE2-NEXT: pandn %xmm7, %xmm10 818; SSE2-NEXT: por %xmm2, %xmm10 819; SSE2-NEXT: pand %xmm12, %xmm10 820; SSE2-NEXT: pandn %xmm3, %xmm12 821; SSE2-NEXT: por %xmm10, %xmm12 822; SSE2-NEXT: movdqa %xmm8, %xmm2 823; SSE2-NEXT: movdqa %xmm12, %xmm3 824; SSE2-NEXT: retq 825; 826; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 827; SSE41: # %bb.0: 828; SSE41-NEXT: movdqa %xmm1, %xmm4 829; SSE41-NEXT: movdqa %xmm0, %xmm1 830; SSE41-NEXT: psraw $15, %xmm0 831; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2> 832; SSE41-NEXT: pmulhuw %xmm7, %xmm0 833; SSE41-NEXT: paddw %xmm1, %xmm0 834; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768> 835; SSE41-NEXT: movdqa %xmm0, %xmm5 836; SSE41-NEXT: pmulhw %xmm6, %xmm5 837; SSE41-NEXT: psraw $1, %xmm0 838; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7] 839; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] 840; SSE41-NEXT: movdqa %xmm4, %xmm1 841; SSE41-NEXT: psraw $15, %xmm1 842; SSE41-NEXT: pmulhuw %xmm7, %xmm1 843; SSE41-NEXT: paddw %xmm4, %xmm1 844; SSE41-NEXT: movdqa %xmm1, %xmm5 845; SSE41-NEXT: pmulhw %xmm6, %xmm5 846; SSE41-NEXT: psraw $1, %xmm1 847; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7] 848; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] 849; SSE41-NEXT: movdqa %xmm2, %xmm4 850; SSE41-NEXT: psraw $15, %xmm4 851; SSE41-NEXT: pmulhuw %xmm7, %xmm4 852; SSE41-NEXT: paddw %xmm2, %xmm4 853; SSE41-NEXT: movdqa %xmm4, %xmm5 854; SSE41-NEXT: pmulhw %xmm6, %xmm5 855; SSE41-NEXT: psraw $1, %xmm4 856; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] 857; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7] 858; SSE41-NEXT: movdqa %xmm3, %xmm5 859; SSE41-NEXT: psraw $15, %xmm5 860; SSE41-NEXT: pmulhuw %xmm7, %xmm5 861; SSE41-NEXT: paddw %xmm3, %xmm5 862; SSE41-NEXT: pmulhw %xmm5, %xmm6 863; SSE41-NEXT: psraw $1, %xmm5 864; SSE41-NEXT: pblendw {{.*#+}} xmm5 = 
xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] 865; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7] 866; SSE41-NEXT: movdqa %xmm4, %xmm2 867; SSE41-NEXT: movdqa %xmm5, %xmm3 868; SSE41-NEXT: retq 869; 870; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 871; AVX1: # %bb.0: 872; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 873; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3 874; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2> 875; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3 876; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 877; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768> 878; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5 879; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 880; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] 881; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5 882; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5 883; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5 884; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6 885; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5 886; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] 887; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 888; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 889; AVX1-NEXT: # ymm5 = mem[0,1,0,1] 890; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 891; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0 892; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 893; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 894; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6 895; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6 896; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2 897; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm6 898; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 899; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7] 900; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6 901; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4 902; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4 903; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3 904; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4 905; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7] 906; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 907; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 908; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1 909; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 910; AVX1-NEXT: retq 911; 912; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 913; AVX2: # %bb.0: 914; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2 915; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] 916; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 917; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 918; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2 919; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768] 920; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 921; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5 922; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2 923; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15] 924; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 925; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2 926; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 927; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2 928; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3 929; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2 930; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15] 931; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] 932; AVX2-NEXT: retq 933; 934; AVX512F-LABEL: 
combine_vec_sdiv_by_pow2b_v32i16: 935; AVX512F: # %bb.0: 936; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1 937; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] 938; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 939; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 940; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 941; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 942; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] 943; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 944; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1 945; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 946; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 947; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5 948; AVX512F-NEXT: vpmulhuw %ymm2, %ymm5, %ymm2 949; AVX512F-NEXT: vpaddw %ymm2, %ymm4, %ymm2 950; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 951; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2 952; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 953; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 954; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 955; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 956; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 957; AVX512F-NEXT: retq 958; 959; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 960; AVX512BW: # %bb.0: 961; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1 962; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 963; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 964; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm1, %zmm1 965; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 966; AVX512BW-NEXT: kmovd %eax, %k1 967; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} 968; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 969; AVX512BW-NEXT: retq 970; 971; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 972; XOP: # %bb.0: 973; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 974; XOP-NEXT: vpsraw $15, %xmm2, %xmm3 975; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,65522,65521,65524,65523,65525,65526,65521> 976; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3 977; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2 978; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535> 979; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 980; XOP-NEXT: vpsraw $15, %xmm0, %xmm5 981; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5 982; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5 983; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5 984; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 985; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 986; XOP-NEXT: # ymm5 = mem[0,1,0,1] 987; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0 988; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 989; XOP-NEXT: vpsraw $15, %xmm2, %xmm6 990; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6 991; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2 992; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 993; XOP-NEXT: vpsraw $15, %xmm1, %xmm6 994; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4 995; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4 996; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3 997; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 998; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1 999; XOP-NEXT: retq 1000 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 1001 ret <32 x i16> %1 1002} 1003 1004define <4 x i32> 
@combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) { 1005; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1006; SSE2: # %bb.0: 1007; SSE2-NEXT: movdqa %xmm0, %xmm1 1008; SSE2-NEXT: psrad $31, %xmm1 1009; SSE2-NEXT: movdqa %xmm1, %xmm2 1010; SSE2-NEXT: psrld $28, %xmm2 1011; SSE2-NEXT: movdqa %xmm1, %xmm3 1012; SSE2-NEXT: psrld $29, %xmm3 1013; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1014; SSE2-NEXT: psrld $30, %xmm1 1015; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] 1016; SSE2-NEXT: paddd %xmm0, %xmm1 1017; SSE2-NEXT: movdqa %xmm1, %xmm2 1018; SSE2-NEXT: psrad $4, %xmm2 1019; SSE2-NEXT: movdqa %xmm1, %xmm3 1020; SSE2-NEXT: psrad $3, %xmm3 1021; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1022; SSE2-NEXT: psrad $2, %xmm1 1023; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] 1024; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1025; SSE2-NEXT: movaps %xmm1, %xmm0 1026; SSE2-NEXT: retq 1027; 1028; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1029; SSE41: # %bb.0: 1030; SSE41-NEXT: movdqa %xmm0, %xmm1 1031; SSE41-NEXT: psrad $31, %xmm1 1032; SSE41-NEXT: movdqa %xmm1, %xmm2 1033; SSE41-NEXT: psrld $28, %xmm2 1034; SSE41-NEXT: movdqa %xmm1, %xmm3 1035; SSE41-NEXT: psrld $30, %xmm3 1036; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1037; SSE41-NEXT: psrld $29, %xmm1 1038; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1039; SSE41-NEXT: paddd %xmm0, %xmm1 1040; SSE41-NEXT: movdqa %xmm1, %xmm2 1041; SSE41-NEXT: psrad $4, %xmm2 1042; SSE41-NEXT: movdqa %xmm1, %xmm3 1043; SSE41-NEXT: psrad $2, %xmm3 1044; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1045; SSE41-NEXT: psrad $3, %xmm1 1046; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1047; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1048; SSE41-NEXT: movdqa %xmm1, %xmm0 1049; SSE41-NEXT: retq 1050; 1051; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1052; AVX1: # %bb.0: 1053; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 1054; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 1055; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3 1056; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1057; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1 1058; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1059; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1060; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 1061; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 1062; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1063; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 1064; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1065; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1066; AVX1-NEXT: retq 1067; 1068; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1069; AVX2ORLATER: # %bb.0: 1070; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 1071; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 1072; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1073; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 1074; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1075; AVX2ORLATER-NEXT: retq 1076; 1077; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1078; XOP: # %bb.0: 1079; XOP-NEXT: vpsrad $31, %xmm0, %xmm1 1080; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1 1081; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1082; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1 1083; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1084; XOP-NEXT: retq 1085 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16> 1086 ret 
<4 x i32> %1 1087} 1088 1089define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { 1090; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1091; SSE2: # %bb.0: 1092; SSE2-NEXT: movdqa %xmm0, %xmm2 1093; SSE2-NEXT: psrad $31, %xmm0 1094; SSE2-NEXT: movdqa %xmm0, %xmm3 1095; SSE2-NEXT: psrld $28, %xmm3 1096; SSE2-NEXT: movdqa %xmm0, %xmm4 1097; SSE2-NEXT: psrld $29, %xmm4 1098; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1099; SSE2-NEXT: psrld $30, %xmm0 1100; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] 1101; SSE2-NEXT: paddd %xmm2, %xmm0 1102; SSE2-NEXT: movdqa %xmm0, %xmm3 1103; SSE2-NEXT: psrad $4, %xmm3 1104; SSE2-NEXT: movdqa %xmm0, %xmm4 1105; SSE2-NEXT: psrad $3, %xmm4 1106; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1107; SSE2-NEXT: psrad $2, %xmm0 1108; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] 1109; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] 1110; SSE2-NEXT: movdqa %xmm1, %xmm2 1111; SSE2-NEXT: psrad $31, %xmm2 1112; SSE2-NEXT: movdqa %xmm2, %xmm3 1113; SSE2-NEXT: psrld $28, %xmm3 1114; SSE2-NEXT: movdqa %xmm2, %xmm4 1115; SSE2-NEXT: psrld $29, %xmm4 1116; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1117; SSE2-NEXT: psrld $30, %xmm2 1118; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] 1119; SSE2-NEXT: paddd %xmm1, %xmm2 1120; SSE2-NEXT: movdqa %xmm2, %xmm3 1121; SSE2-NEXT: psrad $4, %xmm3 1122; SSE2-NEXT: movdqa %xmm2, %xmm4 1123; SSE2-NEXT: psrad $3, %xmm4 1124; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1125; SSE2-NEXT: psrad $2, %xmm2 1126; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] 1127; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1128; SSE2-NEXT: movaps %xmm2, %xmm1 1129; SSE2-NEXT: retq 1130; 1131; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1132; SSE41: # %bb.0: 1133; SSE41-NEXT: movdqa %xmm0, %xmm2 1134; SSE41-NEXT: psrad $31, %xmm0 1135; SSE41-NEXT: movdqa %xmm0, %xmm3 1136; SSE41-NEXT: psrld $28, %xmm3 1137; SSE41-NEXT: movdqa %xmm0, %xmm4 1138; SSE41-NEXT: psrld $30, %xmm4 1139; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1140; SSE41-NEXT: psrld $29, %xmm0 1141; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] 1142; SSE41-NEXT: paddd %xmm2, %xmm0 1143; SSE41-NEXT: movdqa %xmm0, %xmm3 1144; SSE41-NEXT: psrad $4, %xmm3 1145; SSE41-NEXT: movdqa %xmm0, %xmm4 1146; SSE41-NEXT: psrad $2, %xmm4 1147; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1148; SSE41-NEXT: psrad $3, %xmm0 1149; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] 1150; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] 1151; SSE41-NEXT: movdqa %xmm1, %xmm2 1152; SSE41-NEXT: psrad $31, %xmm2 1153; SSE41-NEXT: movdqa %xmm2, %xmm3 1154; SSE41-NEXT: psrld $28, %xmm3 1155; SSE41-NEXT: movdqa %xmm2, %xmm4 1156; SSE41-NEXT: psrld $30, %xmm4 1157; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1158; SSE41-NEXT: psrld $29, %xmm2 1159; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1160; SSE41-NEXT: paddd %xmm1, %xmm2 1161; SSE41-NEXT: movdqa %xmm2, %xmm3 1162; SSE41-NEXT: psrad $4, %xmm3 1163; SSE41-NEXT: movdqa %xmm2, %xmm4 1164; SSE41-NEXT: psrad $2, %xmm4 1165; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1166; SSE41-NEXT: psrad $3, %xmm2 1167; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1168; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] 1169; SSE41-NEXT: movdqa %xmm2, %xmm1 1170; 
SSE41-NEXT: retq 1171; 1172; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1173; AVX1: # %bb.0: 1174; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1175; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 1176; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1177; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4 1178; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1179; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1180; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1181; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1182; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 1183; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 1184; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1185; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 1186; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1187; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 1188; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1189; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4 1190; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1191; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1192; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1193; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 1194; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1195; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1196; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1197; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1198; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1199; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1200; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1201; AVX1-NEXT: retq 1202; 1203; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1204; AVX2ORLATER: # %bb.0: 1205; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1 1206; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 1207; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1 1208; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1 1209; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1210; AVX2ORLATER-NEXT: retq 1211; 1212; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1213; XOP: # %bb.0: 1214; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 1215; XOP-NEXT: vpsrad $31, %xmm1, %xmm2 1216; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268> 1217; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2 1218; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1219; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292> 1220; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1 1221; XOP-NEXT: vpsrad $31, %xmm0, %xmm4 1222; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3 1223; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3 1224; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2 1225; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1226; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1227; XOP-NEXT: retq 1228 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1229 ret <8 x i32> %1 1230} 1231 1232define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { 1233; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1234; SSE2: # %bb.0: 1235; SSE2-NEXT: movdqa %xmm1, %xmm4 1236; SSE2-NEXT: movdqa %xmm0, %xmm1 1237; SSE2-NEXT: psrad $31, %xmm0 1238; SSE2-NEXT: movdqa %xmm0, %xmm5 1239; SSE2-NEXT: psrld $28, %xmm5 1240; SSE2-NEXT: movdqa %xmm0, %xmm6 1241; SSE2-NEXT: psrld $29, %xmm6 1242; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1243; SSE2-NEXT: psrld $30, %xmm0 1244; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] 1245; SSE2-NEXT: paddd %xmm1, %xmm0 1246; SSE2-NEXT: movdqa %xmm0, %xmm5 1247; SSE2-NEXT: psrad $4, %xmm5 
1248; SSE2-NEXT: movdqa %xmm0, %xmm6 1249; SSE2-NEXT: psrad $3, %xmm6 1250; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1251; SSE2-NEXT: psrad $2, %xmm0 1252; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] 1253; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1254; SSE2-NEXT: movdqa %xmm4, %xmm1 1255; SSE2-NEXT: psrad $31, %xmm1 1256; SSE2-NEXT: movdqa %xmm1, %xmm5 1257; SSE2-NEXT: psrld $28, %xmm5 1258; SSE2-NEXT: movdqa %xmm1, %xmm6 1259; SSE2-NEXT: psrld $29, %xmm6 1260; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1261; SSE2-NEXT: psrld $30, %xmm1 1262; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] 1263; SSE2-NEXT: paddd %xmm4, %xmm1 1264; SSE2-NEXT: movdqa %xmm1, %xmm5 1265; SSE2-NEXT: psrad $4, %xmm5 1266; SSE2-NEXT: movdqa %xmm1, %xmm6 1267; SSE2-NEXT: psrad $3, %xmm6 1268; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1269; SSE2-NEXT: psrad $2, %xmm1 1270; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] 1271; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] 1272; SSE2-NEXT: movdqa %xmm2, %xmm4 1273; SSE2-NEXT: psrad $31, %xmm4 1274; SSE2-NEXT: movdqa %xmm4, %xmm5 1275; SSE2-NEXT: psrld $28, %xmm5 1276; SSE2-NEXT: movdqa %xmm4, %xmm6 1277; SSE2-NEXT: psrld $29, %xmm6 1278; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1279; SSE2-NEXT: psrld $30, %xmm4 1280; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] 1281; SSE2-NEXT: paddd %xmm2, %xmm4 1282; SSE2-NEXT: movdqa %xmm4, %xmm5 1283; SSE2-NEXT: psrad $4, %xmm5 1284; SSE2-NEXT: movdqa %xmm4, %xmm6 1285; SSE2-NEXT: psrad $3, %xmm6 1286; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1287; SSE2-NEXT: psrad $2, %xmm4 1288; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] 1289; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] 1290; SSE2-NEXT: movdqa %xmm3, %xmm5 1291; SSE2-NEXT: psrad $31, %xmm5 1292; SSE2-NEXT: movdqa %xmm5, %xmm2 1293; SSE2-NEXT: psrld $28, %xmm2 1294; SSE2-NEXT: movdqa %xmm5, %xmm6 1295; SSE2-NEXT: psrld $29, %xmm6 1296; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] 1297; SSE2-NEXT: psrld $30, %xmm5 1298; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] 1299; SSE2-NEXT: paddd %xmm3, %xmm5 1300; SSE2-NEXT: movdqa %xmm5, %xmm2 1301; SSE2-NEXT: psrad $4, %xmm2 1302; SSE2-NEXT: movdqa %xmm5, %xmm6 1303; SSE2-NEXT: psrad $3, %xmm6 1304; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] 1305; SSE2-NEXT: psrad $2, %xmm5 1306; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] 1307; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] 1308; SSE2-NEXT: movaps %xmm4, %xmm2 1309; SSE2-NEXT: movaps %xmm5, %xmm3 1310; SSE2-NEXT: retq 1311; 1312; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1313; SSE41: # %bb.0: 1314; SSE41-NEXT: movdqa %xmm1, %xmm4 1315; SSE41-NEXT: movdqa %xmm0, %xmm1 1316; SSE41-NEXT: psrad $31, %xmm0 1317; SSE41-NEXT: movdqa %xmm0, %xmm5 1318; SSE41-NEXT: psrld $28, %xmm5 1319; SSE41-NEXT: movdqa %xmm0, %xmm6 1320; SSE41-NEXT: psrld $30, %xmm6 1321; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1322; SSE41-NEXT: psrld $29, %xmm0 1323; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] 1324; SSE41-NEXT: paddd %xmm1, %xmm0 1325; SSE41-NEXT: movdqa %xmm0, %xmm5 1326; SSE41-NEXT: psrad $4, %xmm5 1327; SSE41-NEXT: movdqa %xmm0, %xmm6 1328; SSE41-NEXT: psrad $2, %xmm6 1329; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1330; SSE41-NEXT: psrad $3, %xmm0 1331; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] 1332; SSE41-NEXT: 
pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1333; SSE41-NEXT: movdqa %xmm4, %xmm1 1334; SSE41-NEXT: psrad $31, %xmm1 1335; SSE41-NEXT: movdqa %xmm1, %xmm5 1336; SSE41-NEXT: psrld $28, %xmm5 1337; SSE41-NEXT: movdqa %xmm1, %xmm6 1338; SSE41-NEXT: psrld $30, %xmm6 1339; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1340; SSE41-NEXT: psrld $29, %xmm1 1341; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 1342; SSE41-NEXT: paddd %xmm4, %xmm1 1343; SSE41-NEXT: movdqa %xmm1, %xmm5 1344; SSE41-NEXT: psrad $4, %xmm5 1345; SSE41-NEXT: movdqa %xmm1, %xmm6 1346; SSE41-NEXT: psrad $2, %xmm6 1347; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1348; SSE41-NEXT: psrad $3, %xmm1 1349; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 1350; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] 1351; SSE41-NEXT: movdqa %xmm2, %xmm4 1352; SSE41-NEXT: psrad $31, %xmm4 1353; SSE41-NEXT: movdqa %xmm4, %xmm5 1354; SSE41-NEXT: psrld $28, %xmm5 1355; SSE41-NEXT: movdqa %xmm4, %xmm6 1356; SSE41-NEXT: psrld $30, %xmm6 1357; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1358; SSE41-NEXT: psrld $29, %xmm4 1359; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1360; SSE41-NEXT: paddd %xmm2, %xmm4 1361; SSE41-NEXT: movdqa %xmm4, %xmm5 1362; SSE41-NEXT: psrad $4, %xmm5 1363; SSE41-NEXT: movdqa %xmm4, %xmm6 1364; SSE41-NEXT: psrad $2, %xmm6 1365; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1366; SSE41-NEXT: psrad $3, %xmm4 1367; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1368; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] 1369; SSE41-NEXT: movdqa %xmm3, %xmm5 1370; SSE41-NEXT: psrad $31, %xmm5 1371; SSE41-NEXT: movdqa %xmm5, %xmm2 1372; SSE41-NEXT: psrld $28, %xmm2 1373; SSE41-NEXT: movdqa %xmm5, %xmm6 1374; SSE41-NEXT: psrld $30, %xmm6 1375; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] 1376; SSE41-NEXT: psrld $29, %xmm5 1377; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1378; SSE41-NEXT: paddd %xmm3, %xmm5 1379; SSE41-NEXT: movdqa %xmm5, %xmm2 1380; SSE41-NEXT: psrad $4, %xmm2 1381; SSE41-NEXT: movdqa %xmm5, %xmm6 1382; SSE41-NEXT: psrad $2, %xmm6 1383; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] 1384; SSE41-NEXT: psrad $3, %xmm5 1385; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1386; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] 1387; SSE41-NEXT: movdqa %xmm4, %xmm2 1388; SSE41-NEXT: movdqa %xmm5, %xmm3 1389; SSE41-NEXT: retq 1390; 1391; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1392; AVX1: # %bb.0: 1393; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1394; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 1395; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1396; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1397; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1398; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1399; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1400; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1401; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1402; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1403; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1404; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1405; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1406; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 1407; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1408; AVX1-NEXT: vpsrld 
$30, %xmm3, %xmm5 1409; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1410; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1411; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1412; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3 1413; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1414; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1415; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1416; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3 1417; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1418; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1419; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1420; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1421; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 1422; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1423; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1424; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1425; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1426; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1427; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1428; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1429; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1430; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1431; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1432; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1433; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 1434; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1435; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1436; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1437; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1438; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1439; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 1440; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1441; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1442; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1443; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3 1444; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1445; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1446; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1447; AVX1-NEXT: retq 1448; 1449; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1450; AVX2: # %bb.0: 1451; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 1452; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28] 1453; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 1454; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1455; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 1456; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4] 1457; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 1458; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1459; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1460; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 1461; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1462; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2 1463; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1464; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1465; AVX2-NEXT: retq 1466; 1467; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1468; AVX512F: # %bb.0: 1469; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1 1470; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 1471; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1472; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 1473; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 1474; AVX512F-NEXT: kmovw %eax, %k1 1475; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1476; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 1477; AVX512F-NEXT: retq 1478; 1479; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1480; AVX512BW: # 
%bb.0: 1481; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1 1482; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 1483; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1484; AVX512BW-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 1485; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111 1486; AVX512BW-NEXT: kmovd %eax, %k1 1487; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1488; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 1489; AVX512BW-NEXT: retq 1490; 1491; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1492; XOP: # %bb.0: 1493; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1494; XOP-NEXT: vpsrad $31, %xmm2, %xmm3 1495; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268> 1496; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3 1497; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1498; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292> 1499; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1500; XOP-NEXT: vpsrad $31, %xmm0, %xmm5 1501; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1502; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5 1503; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5 1504; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 1505; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1506; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1507; XOP-NEXT: vpsrad $31, %xmm2, %xmm5 1508; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1509; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2 1510; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1511; XOP-NEXT: vpsrad $31, %xmm1, %xmm5 1512; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4 1513; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4 1514; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3 1515; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1516; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1517; XOP-NEXT: retq 1518 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1519 ret <16 x i32> %1 1520} 1521 1522define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { 1523; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1524; SSE2: # %bb.0: 1525; SSE2-NEXT: movdqa %xmm0, %xmm1 1526; SSE2-NEXT: psrad $31, %xmm1 1527; SSE2-NEXT: psrlq $62, %xmm1 1528; SSE2-NEXT: paddq %xmm0, %xmm1 1529; SSE2-NEXT: movdqa %xmm1, %xmm2 1530; SSE2-NEXT: psrad $2, %xmm2 1531; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] 1532; SSE2-NEXT: psrlq $2, %xmm1 1533; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1534; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1535; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1536; SSE2-NEXT: retq 1537; 1538; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1539; SSE41: # %bb.0: 1540; SSE41-NEXT: movdqa %xmm0, %xmm1 1541; SSE41-NEXT: psrad $31, %xmm1 1542; SSE41-NEXT: psrlq $62, %xmm1 1543; SSE41-NEXT: paddq %xmm0, %xmm1 1544; SSE41-NEXT: movdqa %xmm1, %xmm2 1545; SSE41-NEXT: psrad $2, %xmm2 1546; SSE41-NEXT: psrlq $2, %xmm1 1547; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1548; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1549; SSE41-NEXT: movdqa %xmm1, %xmm0 1550; SSE41-NEXT: retq 1551; 1552; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1553; AVX1: # %bb.0: 1554; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1555; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 1556; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1 1557; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1558; AVX1-NEXT: vpsrad $2, %xmm1, %xmm2 1559; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1 1560; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1561; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm1[4,5,6,7] 1562; AVX1-NEXT: retq 1563; 1564; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1565; AVX2: # %bb.0: 1566; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1567; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 1568; AVX2-NEXT: vpsrlq $62, %xmm1, %xmm1 1569; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1570; AVX2-NEXT: vpsrad $2, %xmm1, %xmm2 1571; AVX2-NEXT: vpsrlq $2, %xmm1, %xmm1 1572; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] 1573; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1574; AVX2-NEXT: retq 1575; 1576; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1577; AVX512F: # %bb.0: 1578; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1579; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1 1580; AVX512F-NEXT: vpsrlq $62, %xmm1, %xmm1 1581; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1582; AVX512F-NEXT: vpsraq $2, %zmm1, %zmm1 1583; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1584; AVX512F-NEXT: vzeroupper 1585; AVX512F-NEXT: retq 1586; 1587; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1588; AVX512BW: # %bb.0: 1589; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1 1590; AVX512BW-NEXT: vpsrlq $62, %xmm1, %xmm1 1591; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1592; AVX512BW-NEXT: vpsraq $2, %xmm1, %xmm1 1593; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1594; AVX512BW-NEXT: retq 1595; 1596; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1597; XOP: # %bb.0: 1598; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm1 1599; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1 1600; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1601; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm1, %xmm1 1602; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1603; XOP-NEXT: retq 1604 %1 = sdiv <2 x i64> %x, <i64 1, i64 4> 1605 ret <2 x i64> %1 1606} 1607 1608define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { 1609; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1610; SSE2: # %bb.0: 1611; SSE2-NEXT: movdqa %xmm0, %xmm2 1612; SSE2-NEXT: psrad $31, %xmm2 1613; SSE2-NEXT: psrlq $62, %xmm2 1614; SSE2-NEXT: paddq %xmm0, %xmm2 1615; SSE2-NEXT: movdqa %xmm2, %xmm3 1616; SSE2-NEXT: psrad $2, %xmm3 1617; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] 1618; SSE2-NEXT: psrlq $2, %xmm2 1619; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1620; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1621; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 1622; SSE2-NEXT: movdqa %xmm1, %xmm2 1623; SSE2-NEXT: psrad $31, %xmm2 1624; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1625; SSE2-NEXT: movdqa %xmm2, %xmm3 1626; SSE2-NEXT: psrlq $61, %xmm3 1627; SSE2-NEXT: psrlq $60, %xmm2 1628; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] 1629; SSE2-NEXT: paddq %xmm1, %xmm2 1630; SSE2-NEXT: movdqa %xmm2, %xmm1 1631; SSE2-NEXT: psrlq $3, %xmm1 1632; SSE2-NEXT: psrlq $4, %xmm2 1633; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1634; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] 1635; SSE2-NEXT: xorpd %xmm1, %xmm2 1636; SSE2-NEXT: psubq %xmm1, %xmm2 1637; SSE2-NEXT: movdqa %xmm2, %xmm1 1638; SSE2-NEXT: retq 1639; 1640; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1641; SSE41: # %bb.0: 1642; SSE41-NEXT: movdqa %xmm0, %xmm2 1643; SSE41-NEXT: psrad $31, %xmm0 1644; SSE41-NEXT: psrlq $62, %xmm0 1645; SSE41-NEXT: paddq %xmm2, %xmm0 1646; SSE41-NEXT: movdqa %xmm0, %xmm3 1647; SSE41-NEXT: psrad $2, %xmm3 1648; SSE41-NEXT: psrlq $2, %xmm0 1649; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 1650; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm2[0,1,2,3],xmm0[4,5,6,7] 1651; SSE41-NEXT: movdqa %xmm1, %xmm2 1652; SSE41-NEXT: psrad $31, %xmm2 1653; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1654; SSE41-NEXT: movdqa %xmm2, %xmm3 1655; SSE41-NEXT: psrlq $60, %xmm3 1656; SSE41-NEXT: psrlq $61, %xmm2 1657; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 1658; SSE41-NEXT: paddq %xmm1, %xmm2 1659; SSE41-NEXT: movdqa %xmm2, %xmm1 1660; SSE41-NEXT: psrlq $4, %xmm1 1661; SSE41-NEXT: psrlq $3, %xmm2 1662; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1663; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] 1664; SSE41-NEXT: pxor %xmm1, %xmm2 1665; SSE41-NEXT: psubq %xmm1, %xmm2 1666; SSE41-NEXT: movdqa %xmm2, %xmm1 1667; SSE41-NEXT: retq 1668; 1669; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1670; AVX1: # %bb.0: 1671; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1672; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1673; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 1674; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4 1675; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3 1676; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 1677; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 1678; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 1679; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1 1680; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1681; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488] 1682; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 1683; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 1684; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 1685; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 1686; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 1687; AVX1-NEXT: vpsrad $2, %xmm2, %xmm3 1688; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 1689; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1690; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1691; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1692; AVX1-NEXT: retq 1693; 1694; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1695; AVX2: # %bb.0: 1696; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1697; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 1698; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 1699; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1700; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 1701; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488> 1702; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 1703; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1 1704; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1705; AVX2-NEXT: retq 1706; 1707; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1708; AVX512F: # %bb.0: 1709; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1710; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,2,3,4> 1711; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 1712; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %ymm2, %ymm2 1713; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2 1714; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1 1715; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1716; AVX512F-NEXT: retq 1717; 1718; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1719; AVX512BW: # %bb.0: 1720; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1 1721; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 1722; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1723; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %ymm1, %ymm1 1724; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1725; AVX512BW-NEXT: retq 1726; 1727; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1728; XOP: # %bb.0: 1729; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = 
[18446744073709551553,18446744073709551553] 1730; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2 1731; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2 1732; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 1733; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm2, %xmm2 1734; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 1735; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1 1736; XOP-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1 1737; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 1738; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm1, %xmm1 1739; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1740; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1741; XOP-NEXT: retq 1742 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16> 1743 ret <4 x i64> %1 1744} 1745 1746define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { 1747; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1748; SSE2: # %bb.0: 1749; SSE2-NEXT: movdqa %xmm0, %xmm4 1750; SSE2-NEXT: psrad $31, %xmm4 1751; SSE2-NEXT: psrlq $62, %xmm4 1752; SSE2-NEXT: paddq %xmm0, %xmm4 1753; SSE2-NEXT: movdqa %xmm4, %xmm5 1754; SSE2-NEXT: psrad $2, %xmm5 1755; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 1756; SSE2-NEXT: psrlq $2, %xmm4 1757; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1758; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1759; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] 1760; SSE2-NEXT: movdqa %xmm2, %xmm4 1761; SSE2-NEXT: psrad $31, %xmm4 1762; SSE2-NEXT: psrlq $62, %xmm4 1763; SSE2-NEXT: paddq %xmm2, %xmm4 1764; SSE2-NEXT: movdqa %xmm4, %xmm5 1765; SSE2-NEXT: psrad $2, %xmm5 1766; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 1767; SSE2-NEXT: psrlq $2, %xmm4 1768; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1769; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1770; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] 1771; SSE2-NEXT: movdqa %xmm1, %xmm4 1772; SSE2-NEXT: psrad $31, %xmm4 1773; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1774; SSE2-NEXT: movdqa %xmm4, %xmm5 1775; SSE2-NEXT: psrlq $61, %xmm5 1776; SSE2-NEXT: psrlq $60, %xmm4 1777; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] 1778; SSE2-NEXT: paddq %xmm1, %xmm4 1779; SSE2-NEXT: movdqa %xmm4, %xmm1 1780; SSE2-NEXT: psrlq $3, %xmm1 1781; SSE2-NEXT: psrlq $4, %xmm4 1782; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 1783; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] 1784; SSE2-NEXT: xorpd %xmm1, %xmm4 1785; SSE2-NEXT: psubq %xmm1, %xmm4 1786; SSE2-NEXT: movdqa %xmm3, %xmm5 1787; SSE2-NEXT: psrad $31, %xmm5 1788; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1789; SSE2-NEXT: movdqa %xmm5, %xmm6 1790; SSE2-NEXT: psrlq $61, %xmm6 1791; SSE2-NEXT: psrlq $60, %xmm5 1792; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] 1793; SSE2-NEXT: paddq %xmm3, %xmm5 1794; SSE2-NEXT: movdqa %xmm5, %xmm3 1795; SSE2-NEXT: psrlq $3, %xmm3 1796; SSE2-NEXT: psrlq $4, %xmm5 1797; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] 1798; SSE2-NEXT: xorpd %xmm1, %xmm5 1799; SSE2-NEXT: psubq %xmm1, %xmm5 1800; SSE2-NEXT: movdqa %xmm4, %xmm1 1801; SSE2-NEXT: movdqa %xmm5, %xmm3 1802; SSE2-NEXT: retq 1803; 1804; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1805; SSE41: # %bb.0: 1806; SSE41-NEXT: movdqa %xmm2, %xmm5 1807; SSE41-NEXT: movdqa %xmm1, %xmm4 1808; SSE41-NEXT: movdqa %xmm0, %xmm1 1809; SSE41-NEXT: psrad $31, %xmm0 1810; SSE41-NEXT: psrlq $62, %xmm0 1811; SSE41-NEXT: paddq %xmm1, %xmm0 1812; SSE41-NEXT: movdqa %xmm0, %xmm2 1813; SSE41-NEXT: psrad $2, %xmm2 1814; SSE41-NEXT: psrlq $2, %xmm0 1815; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1816; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1817; SSE41-NEXT: movdqa %xmm5, %xmm2 1818; SSE41-NEXT: psrad $31, %xmm2 1819; SSE41-NEXT: psrlq $62, %xmm2 1820; SSE41-NEXT: paddq %xmm5, %xmm2 1821; SSE41-NEXT: movdqa %xmm2, %xmm1 1822; SSE41-NEXT: psrad $2, %xmm1 1823; SSE41-NEXT: psrlq $2, %xmm2 1824; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1825; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] 1826; SSE41-NEXT: movdqa %xmm4, %xmm1 1827; SSE41-NEXT: psrad $31, %xmm1 1828; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1829; SSE41-NEXT: movdqa %xmm1, %xmm5 1830; SSE41-NEXT: psrlq $60, %xmm5 1831; SSE41-NEXT: psrlq $61, %xmm1 1832; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] 1833; SSE41-NEXT: paddq %xmm4, %xmm1 1834; SSE41-NEXT: movdqa %xmm1, %xmm4 1835; SSE41-NEXT: psrlq $4, %xmm4 1836; SSE41-NEXT: psrlq $3, %xmm1 1837; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] 1838; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488] 1839; SSE41-NEXT: pxor %xmm5, %xmm1 1840; SSE41-NEXT: psubq %xmm5, %xmm1 1841; SSE41-NEXT: movdqa %xmm3, %xmm4 1842; SSE41-NEXT: psrad $31, %xmm4 1843; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1844; SSE41-NEXT: movdqa %xmm4, %xmm6 1845; SSE41-NEXT: psrlq $60, %xmm6 1846; SSE41-NEXT: psrlq $61, %xmm4 1847; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] 1848; SSE41-NEXT: paddq %xmm3, %xmm4 1849; SSE41-NEXT: movdqa %xmm4, %xmm3 1850; SSE41-NEXT: psrlq $4, %xmm3 1851; SSE41-NEXT: psrlq $3, %xmm4 1852; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1853; SSE41-NEXT: pxor %xmm5, %xmm4 1854; SSE41-NEXT: psubq %xmm5, %xmm4 1855; SSE41-NEXT: movdqa %xmm4, %xmm3 1856; SSE41-NEXT: retq 1857; 1858; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1859; AVX1: # %bb.0: 1860; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1861; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1862; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 1863; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5 1864; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4 1865; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7] 1866; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 1867; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4 1868; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 1869; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 1870; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] 1871; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1872; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 1873; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 1874; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5 1875; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5 1876; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6 1877; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5 1878; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1879; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 1880; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] 1881; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1882; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5 1883; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6 1884; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5 1885; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] 1886; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 1887; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5 1888; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 1889; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] 1890; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1891; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 1892; AVX1-NEXT: vpcmpgtq %xmm1, 
%xmm2, %xmm2 1893; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 1894; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2 1895; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1896; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 1897; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1898; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1899; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1900; AVX1-NEXT: retq 1901; 1902; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1903; AVX2: # %bb.0: 1904; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1905; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 1906; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,62,61,60> 1907; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3 1908; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3 1909; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,2,3,4> 1910; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3 1911; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488> 1912; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3 1913; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 1914; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] 1915; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 1916; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2 1917; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2 1918; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2 1919; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 1920; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2 1921; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1922; AVX2-NEXT: retq 1923; 1924; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1925; AVX512F: # %bb.0: 1926; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1 1927; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1 1928; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 1929; AVX512F-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1 1930; AVX512F-NEXT: movb $17, %al 1931; AVX512F-NEXT: kmovw %eax, %k1 1932; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 1933; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 1934; AVX512F-NEXT: retq 1935; 1936; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1937; AVX512BW: # %bb.0: 1938; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1 1939; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1 1940; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 1941; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1 1942; AVX512BW-NEXT: movb $17, %al 1943; AVX512BW-NEXT: kmovd %eax, %k1 1944; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 1945; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 1946; AVX512BW-NEXT: retq 1947; 1948; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1949; XOP: # %bb.0: 1950; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1951; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] 1952; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4 1953; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] 1954; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4 1955; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2 1956; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612] 1957; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 1958; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6 1959; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6 1960; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6 1961; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = <u,18446744073709551614> 1962; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6 1963; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 1964; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 1965; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1966; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6 1967; XOP-NEXT: vpshlq %xmm5, %xmm6, %xmm5 1968; XOP-NEXT: vpaddq %xmm5, %xmm2, %xmm2 1969; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 1970; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3 
1971; XOP-NEXT: vpsrlq $62, %xmm3, %xmm3 1972; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3 1973; XOP-NEXT: vpshaq %xmm7, %xmm3, %xmm3 1974; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1975; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1976; XOP-NEXT: retq 1977 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16> 1978 ret <8 x i64> %1 1979} 1980 1981define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { 1982; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 1983; SSE2: # %bb.0: 1984; SSE2-NEXT: movdqa %xmm0, %xmm1 1985; SSE2-NEXT: psrad $31, %xmm0 1986; SSE2-NEXT: movdqa %xmm0, %xmm2 1987; SSE2-NEXT: psrld $28, %xmm2 1988; SSE2-NEXT: movdqa %xmm0, %xmm3 1989; SSE2-NEXT: psrld $29, %xmm3 1990; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1991; SSE2-NEXT: psrld $30, %xmm0 1992; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] 1993; SSE2-NEXT: paddd %xmm1, %xmm0 1994; SSE2-NEXT: movdqa %xmm0, %xmm2 1995; SSE2-NEXT: psrad $4, %xmm2 1996; SSE2-NEXT: movdqa %xmm0, %xmm3 1997; SSE2-NEXT: psrad $3, %xmm3 1998; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1999; SSE2-NEXT: psrad $2, %xmm0 2000; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] 2001; SSE2-NEXT: pxor %xmm2, %xmm2 2002; SSE2-NEXT: psubd %xmm0, %xmm2 2003; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] 2004; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2005; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] 2006; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2007; SSE2-NEXT: retq 2008; 2009; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2010; SSE41: # %bb.0: 2011; SSE41-NEXT: movdqa %xmm0, %xmm1 2012; SSE41-NEXT: psrad $31, %xmm1 2013; SSE41-NEXT: movdqa %xmm1, %xmm2 2014; SSE41-NEXT: psrld $28, %xmm2 2015; SSE41-NEXT: movdqa %xmm1, %xmm3 2016; SSE41-NEXT: psrld $30, %xmm3 2017; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2018; SSE41-NEXT: psrld $29, %xmm1 2019; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 2020; SSE41-NEXT: paddd %xmm0, %xmm1 2021; SSE41-NEXT: movdqa %xmm1, %xmm2 2022; SSE41-NEXT: psrad $4, %xmm2 2023; SSE41-NEXT: movdqa %xmm1, %xmm3 2024; SSE41-NEXT: psrad $2, %xmm3 2025; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2026; SSE41-NEXT: pxor %xmm2, %xmm2 2027; SSE41-NEXT: psubd %xmm3, %xmm2 2028; SSE41-NEXT: psrad $3, %xmm1 2029; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2030; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 2031; SSE41-NEXT: movdqa %xmm1, %xmm0 2032; SSE41-NEXT: retq 2033; 2034; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2035; AVX1: # %bb.0: 2036; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 2037; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 2038; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3 2039; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2040; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1 2041; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 2042; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2043; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 2044; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 2045; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2046; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 2047; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 2048; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 2049; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2050; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 2051; AVX1-NEXT: retq 2052; 2053; 
AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2ORLATER-NEXT: vpsubd %xmm1, %xmm2, %xmm2
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; XOP: # %bb.0:
; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; XOP-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
  ret <4 x i32> %1
}

; PR37119
define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
; SSE-LABEL: non_splat_minus_one_divisor_0:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_0:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: non_splat_minus_one_divisor_0:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: non_splat_minus_one_divisor_0:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512BW-NEXT: retq
;
; XOP-LABEL: non_splat_minus_one_divisor_0:
; XOP: # %bb.0:
;
XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2145; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 2146; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2147; XOP-NEXT: retq 2148 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2149 ret <16 x i8> %div 2150} 2151 2152define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { 2153; SSE2-LABEL: non_splat_minus_one_divisor_1: 2154; SSE2: # %bb.0: 2155; SSE2-NEXT: pxor %xmm1, %xmm1 2156; SSE2-NEXT: pxor %xmm2, %xmm2 2157; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 2158; SSE2-NEXT: movdqa %xmm2, %xmm3 2159; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 2160; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3 2161; SSE2-NEXT: psrlw $8, %xmm3 2162; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 2163; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2 2164; SSE2-NEXT: psrlw $8, %xmm2 2165; SSE2-NEXT: packuswb %xmm3, %xmm2 2166; SSE2-NEXT: paddb %xmm0, %xmm2 2167; SSE2-NEXT: movdqa %xmm2, %xmm1 2168; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 2169; SSE2-NEXT: psraw $8, %xmm1 2170; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1 2171; SSE2-NEXT: psrlw $8, %xmm1 2172; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2173; SSE2-NEXT: psraw $8, %xmm2 2174; SSE2-NEXT: psllw $7, %xmm2 2175; SSE2-NEXT: psrlw $8, %xmm2 2176; SSE2-NEXT: packuswb %xmm1, %xmm2 2177; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2178; SSE2-NEXT: pand %xmm1, %xmm2 2179; SSE2-NEXT: pandn %xmm0, %xmm1 2180; SSE2-NEXT: por %xmm2, %xmm1 2181; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2182; SSE2-NEXT: pxor %xmm0, %xmm1 2183; SSE2-NEXT: psubb %xmm0, %xmm1 2184; SSE2-NEXT: movdqa %xmm1, %xmm0 2185; SSE2-NEXT: retq 2186; 2187; SSE41-LABEL: non_splat_minus_one_divisor_1: 2188; SSE41: # %bb.0: 2189; SSE41-NEXT: movdqa %xmm0, %xmm1 2190; SSE41-NEXT: pxor %xmm0, %xmm0 2191; SSE41-NEXT: pxor %xmm3, %xmm3 2192; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 2193; SSE41-NEXT: pxor %xmm4, %xmm4 2194; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 2195; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2196; SSE41-NEXT: psllw $1, %xmm2 2197; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] 2198; SSE41-NEXT: psrlw $8, %xmm2 2199; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 2200; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm3 2201; SSE41-NEXT: psrlw $8, %xmm3 2202; SSE41-NEXT: packuswb %xmm3, %xmm2 2203; SSE41-NEXT: paddb %xmm1, %xmm2 2204; SSE41-NEXT: movdqa %xmm2, %xmm0 2205; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 2206; SSE41-NEXT: psraw $8, %xmm0 2207; SSE41-NEXT: movdqa 
%xmm0, %xmm3 2208; SSE41-NEXT: psllw $1, %xmm3 2209; SSE41-NEXT: psllw $7, %xmm0 2210; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] 2211; SSE41-NEXT: psrlw $8, %xmm0 2212; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2213; SSE41-NEXT: psraw $8, %xmm2 2214; SSE41-NEXT: psllw $7, %xmm2 2215; SSE41-NEXT: psrlw $8, %xmm2 2216; SSE41-NEXT: packuswb %xmm0, %xmm2 2217; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2218; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 2219; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2220; SSE41-NEXT: pxor %xmm0, %xmm1 2221; SSE41-NEXT: psubb %xmm0, %xmm1 2222; SSE41-NEXT: movdqa %xmm1, %xmm0 2223; SSE41-NEXT: retq 2224; 2225; AVX1-LABEL: non_splat_minus_one_divisor_1: 2226; AVX1: # %bb.0: 2227; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2228; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 2229; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2230; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2231; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4 2232; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7] 2233; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 2234; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2235; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 2236; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2237; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 2238; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2239; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2240; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 2241; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3 2242; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 2243; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] 2244; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2245; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2246; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 2247; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 2248; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2249; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2250; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2251; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2252; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2253; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2254; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2255; AVX1-NEXT: retq 2256; 2257; AVX2-LABEL: non_splat_minus_one_divisor_1: 2258; AVX2: # %bb.0: 2259; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2260; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2261; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2262; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 2263; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2264; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2265; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2266; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2267; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2268; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 2269; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2270; AVX2-NEXT: 
vextracti128 $1, %ymm1, %xmm2 2271; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2272; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2273; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2274; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2275; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2276; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2277; AVX2-NEXT: vzeroupper 2278; AVX2-NEXT: retq 2279; 2280; AVX512F-LABEL: non_splat_minus_one_divisor_1: 2281; AVX512F: # %bb.0: 2282; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2283; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2284; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 2285; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 2286; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 2287; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2288; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 2289; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 2290; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 2291; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2292; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2293; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2294; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 2295; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2296; AVX512F-NEXT: vzeroupper 2297; AVX512F-NEXT: retq 2298; 2299; AVX512BW-LABEL: non_splat_minus_one_divisor_1: 2300; AVX512BW: # %bb.0: 2301; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2302; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 2303; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2304; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2 2305; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2 2306; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2 2307; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2 2308; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm2, %ymm2 2309; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2 2310; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB 2311; AVX512BW-NEXT: kmovd %eax, %k1 2312; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} 2313; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0 2314; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44 2315; AVX512BW-NEXT: kmovd %eax, %k1 2316; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1} 2317; AVX512BW-NEXT: vzeroupper 2318; AVX512BW-NEXT: retq 2319; 2320; XOP-LABEL: non_splat_minus_one_divisor_1: 2321; XOP: # %bb.0: 2322; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 2323; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2324; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1 2325; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2326; XOP-NEXT: vpshab {{.*}}(%rip), %xmm1, %xmm1 2327; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2328; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2329; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2330; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 2331; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2332; XOP-NEXT: retq 2333 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, 
i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128> 2334 ret <16 x i8> %div 2335} 2336 2337define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { 2338; SSE2-LABEL: non_splat_minus_one_divisor_2: 2339; SSE2: # %bb.0: 2340; SSE2-NEXT: movdqa %xmm0, %xmm1 2341; SSE2-NEXT: psrld $31, %xmm1 2342; SSE2-NEXT: paddd %xmm0, %xmm1 2343; SSE2-NEXT: psrad $1, %xmm1 2344; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2345; SSE2-NEXT: pxor %xmm0, %xmm0 2346; SSE2-NEXT: psubd %xmm1, %xmm0 2347; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 2348; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 2349; SSE2-NEXT: retq 2350; 2351; SSE41-LABEL: non_splat_minus_one_divisor_2: 2352; SSE41: # %bb.0: 2353; SSE41-NEXT: movdqa %xmm0, %xmm1 2354; SSE41-NEXT: psrld $31, %xmm1 2355; SSE41-NEXT: paddd %xmm0, %xmm1 2356; SSE41-NEXT: psrad $1, %xmm1 2357; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2358; SSE41-NEXT: pxor %xmm0, %xmm0 2359; SSE41-NEXT: psubd %xmm1, %xmm0 2360; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] 2361; SSE41-NEXT: movdqa %xmm1, %xmm0 2362; SSE41-NEXT: retq 2363; 2364; AVX1-LABEL: non_splat_minus_one_divisor_2: 2365; AVX1: # %bb.0: 2366; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1 2367; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2368; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1 2369; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2370; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2371; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2372; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 2373; AVX1-NEXT: retq 2374; 2375; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2: 2376; AVX2ORLATER: # %bb.0: 2377; AVX2ORLATER-NEXT: vpsrld $31, %xmm0, %xmm1 2378; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2379; AVX2ORLATER-NEXT: vpsrad $1, %xmm1, %xmm1 2380; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2381; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1 2382; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2383; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 2384; AVX2ORLATER-NEXT: retq 2385; 2386; XOP-LABEL: non_splat_minus_one_divisor_2: 2387; XOP: # %bb.0: 2388; XOP-NEXT: vpsrld $31, %xmm0, %xmm1 2389; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2390; XOP-NEXT: vpsrad $1, %xmm1, %xmm1 2391; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2392; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 2393; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2394; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 2395; XOP-NEXT: retq 2396 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2> 2397 ret <4 x i32> %div 2398} 2399 2400define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) { 2401; SSE-LABEL: combine_vec_sdiv_nonuniform: 2402; SSE: # %bb.0: 2403; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0 2404; SSE-NEXT: movdqa %xmm0, %xmm1 2405; SSE-NEXT: psrlw $15, %xmm1 2406; SSE-NEXT: paddw %xmm0, %xmm1 2407; SSE-NEXT: movdqa %xmm1, %xmm0 2408; SSE-NEXT: retq 2409; 2410; AVX-LABEL: combine_vec_sdiv_nonuniform: 2411; AVX: # %bb.0: 2412; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2413; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1 2414; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2415; AVX-NEXT: retq 2416 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22> 2417 ret <8 x i16> %1 2418} 2419 2420define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) { 2421; SSE2-LABEL: combine_vec_sdiv_nonuniform2: 2422; SSE2: # %bb.0: 2423; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0 2424; SSE2-NEXT: 
movdqa %xmm0, %xmm1 2425; SSE2-NEXT: psraw $2, %xmm1 2426; SSE2-NEXT: movdqa %xmm0, %xmm2 2427; SSE2-NEXT: psraw $1, %xmm2 2428; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 2429; SSE2-NEXT: psrlw $15, %xmm0 2430; SSE2-NEXT: paddw %xmm2, %xmm0 2431; SSE2-NEXT: retq 2432; 2433; SSE41-LABEL: combine_vec_sdiv_nonuniform2: 2434; SSE41: # %bb.0: 2435; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0 2436; SSE41-NEXT: movdqa %xmm0, %xmm1 2437; SSE41-NEXT: psraw $1, %xmm1 2438; SSE41-NEXT: movdqa %xmm0, %xmm2 2439; SSE41-NEXT: psraw $2, %xmm2 2440; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2441; SSE41-NEXT: psrlw $15, %xmm0 2442; SSE41-NEXT: paddw %xmm2, %xmm0 2443; SSE41-NEXT: retq 2444; 2445; AVX1-LABEL: combine_vec_sdiv_nonuniform2: 2446; AVX1: # %bb.0: 2447; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2448; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 2449; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2 2450; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2451; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2452; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2453; AVX1-NEXT: retq 2454; 2455; AVX2-LABEL: combine_vec_sdiv_nonuniform2: 2456; AVX2: # %bb.0: 2457; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2458; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1 2459; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2 2460; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2461; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2462; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2463; AVX2-NEXT: retq 2464; 2465; AVX512F-LABEL: combine_vec_sdiv_nonuniform2: 2466; AVX512F: # %bb.0: 2467; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2468; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1 2469; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2 2470; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2471; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2472; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2473; AVX512F-NEXT: retq 2474; 2475; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2: 2476; AVX512BW: # %bb.0: 2477; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2478; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2479; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 2480; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2481; AVX512BW-NEXT: retq 2482; 2483; XOP-LABEL: combine_vec_sdiv_nonuniform2: 2484; XOP: # %bb.0: 2485; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2486; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2487; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 2488; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2489; XOP-NEXT: retq 2490 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25> 2491 ret <8 x i16> %1 2492} 2493 2494define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) { 2495; SSE2-LABEL: combine_vec_sdiv_nonuniform3: 2496; SSE2: # %bb.0: 2497; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833] 2498; SSE2-NEXT: pmulhw %xmm0, %xmm1 2499; SSE2-NEXT: paddw %xmm0, %xmm1 2500; SSE2-NEXT: movdqa %xmm1, %xmm0 2501; SSE2-NEXT: psraw $4, %xmm0 2502; SSE2-NEXT: movdqa %xmm1, %xmm2 2503; SSE2-NEXT: psraw $8, %xmm2 2504; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2505; SSE2-NEXT: psrlw $15, %xmm1 2506; SSE2-NEXT: paddw %xmm2, %xmm1 2507; SSE2-NEXT: movdqa %xmm1, %xmm0 2508; SSE2-NEXT: retq 2509; 2510; SSE41-LABEL: combine_vec_sdiv_nonuniform3: 2511; SSE41: # %bb.0: 2512; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833] 2513; SSE41-NEXT: pmulhw %xmm0, %xmm1 2514; SSE41-NEXT: paddw %xmm0, %xmm1 2515; SSE41-NEXT: movdqa %xmm1, %xmm0 2516; SSE41-NEXT: psraw $8, %xmm0 2517; SSE41-NEXT: 
movdqa %xmm1, %xmm2 2518; SSE41-NEXT: psraw $4, %xmm2 2519; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] 2520; SSE41-NEXT: psrlw $15, %xmm1 2521; SSE41-NEXT: paddw %xmm2, %xmm1 2522; SSE41-NEXT: movdqa %xmm1, %xmm0 2523; SSE41-NEXT: retq 2524; 2525; AVX1-LABEL: combine_vec_sdiv_nonuniform3: 2526; AVX1: # %bb.0: 2527; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2528; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2529; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 2530; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 2531; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2532; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2533; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2534; AVX1-NEXT: retq 2535; 2536; AVX2-LABEL: combine_vec_sdiv_nonuniform3: 2537; AVX2: # %bb.0: 2538; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2539; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2540; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1 2541; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2 2542; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2543; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2544; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2545; AVX2-NEXT: retq 2546; 2547; AVX512F-LABEL: combine_vec_sdiv_nonuniform3: 2548; AVX512F: # %bb.0: 2549; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2550; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2551; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1 2552; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2 2553; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2554; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2555; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2556; AVX512F-NEXT: retq 2557; 2558; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3: 2559; AVX512BW: # %bb.0: 2560; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2561; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2562; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2563; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 2564; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2565; AVX512BW-NEXT: retq 2566; 2567; XOP-LABEL: combine_vec_sdiv_nonuniform3: 2568; XOP: # %bb.0: 2569; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2570; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2571; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2572; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 2573; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2574; XOP-NEXT: retq 2575 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511> 2576 ret <8 x i16> %1 2577} 2578 2579define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { 2580; SSE2-LABEL: combine_vec_sdiv_nonuniform4: 2581; SSE2: # %bb.0: 2582; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639] 2583; SSE2-NEXT: pmulhw %xmm0, %xmm1 2584; SSE2-NEXT: psubw %xmm0, %xmm1 2585; SSE2-NEXT: movdqa %xmm1, %xmm0 2586; SSE2-NEXT: psraw $4, %xmm0 2587; SSE2-NEXT: movdqa %xmm1, %xmm2 2588; SSE2-NEXT: psraw $8, %xmm2 2589; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2590; SSE2-NEXT: psrlw $15, %xmm1 2591; SSE2-NEXT: paddw %xmm2, %xmm1 2592; SSE2-NEXT: movdqa %xmm1, %xmm0 2593; SSE2-NEXT: retq 2594; 2595; SSE41-LABEL: combine_vec_sdiv_nonuniform4: 2596; SSE41: # %bb.0: 2597; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639] 2598; SSE41-NEXT: pmulhw %xmm0, %xmm1 2599; SSE41-NEXT: psubw %xmm0, %xmm1 2600; SSE41-NEXT: movdqa %xmm1, %xmm0 2601; SSE41-NEXT: psraw $8, %xmm0 2602; SSE41-NEXT: movdqa %xmm1, %xmm2 2603; SSE41-NEXT: psraw $4, %xmm2 2604; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] 2605; SSE41-NEXT: psrlw $15, %xmm1 2606; SSE41-NEXT: paddw %xmm2, %xmm1 2607; SSE41-NEXT: 
movdqa %xmm1, %xmm0 2608; SSE41-NEXT: retq 2609; 2610; AVX1-LABEL: combine_vec_sdiv_nonuniform4: 2611; AVX1: # %bb.0: 2612; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2613; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2614; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 2615; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 2616; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2617; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2618; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2619; AVX1-NEXT: retq 2620; 2621; AVX2-LABEL: combine_vec_sdiv_nonuniform4: 2622; AVX2: # %bb.0: 2623; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2624; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2625; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1 2626; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2 2627; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2628; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2629; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2630; AVX2-NEXT: retq 2631; 2632; AVX512F-LABEL: combine_vec_sdiv_nonuniform4: 2633; AVX512F: # %bb.0: 2634; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2635; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2636; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1 2637; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2 2638; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2639; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2640; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2641; AVX512F-NEXT: retq 2642; 2643; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4: 2644; AVX512BW: # %bb.0: 2645; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2646; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2647; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2648; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 2649; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2650; AVX512BW-NEXT: retq 2651; 2652; XOP-LABEL: combine_vec_sdiv_nonuniform4: 2653; XOP: # %bb.0: 2654; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2655; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2656; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2657; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 2658; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2659; XOP-NEXT: retq 2660 %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510> 2661 ret <8 x i16> %1 2662} 2663 2664define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { 2665; SSE2-LABEL: combine_vec_sdiv_nonuniform5: 2666; SSE2: # %bb.0: 2667; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] 2668; SSE2-NEXT: pmullw %xmm0, %xmm1 2669; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0 2670; SSE2-NEXT: paddw %xmm1, %xmm0 2671; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] 2672; SSE2-NEXT: movdqa %xmm0, %xmm2 2673; SSE2-NEXT: pand %xmm1, %xmm2 2674; SSE2-NEXT: movdqa %xmm0, %xmm3 2675; SSE2-NEXT: psraw $8, %xmm3 2676; SSE2-NEXT: pandn %xmm3, %xmm1 2677; SSE2-NEXT: por %xmm2, %xmm1 2678; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] 2679; SSE2-NEXT: pand %xmm2, %xmm1 2680; SSE2-NEXT: movdqa %xmm0, %xmm3 2681; SSE2-NEXT: psraw $4, %xmm3 2682; SSE2-NEXT: pandn %xmm3, %xmm2 2683; SSE2-NEXT: por %xmm1, %xmm2 2684; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] 2685; SSE2-NEXT: movdqa %xmm2, %xmm3 2686; SSE2-NEXT: pand %xmm1, %xmm3 2687; SSE2-NEXT: psraw $2, %xmm2 2688; SSE2-NEXT: pandn %xmm2, %xmm1 2689; SSE2-NEXT: por %xmm3, %xmm1 2690; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535] 2691; SSE2-NEXT: movdqa %xmm1, %xmm3 2692; SSE2-NEXT: pand %xmm2, %xmm3 2693; SSE2-NEXT: psraw $1, %xmm1 2694; SSE2-NEXT: pandn %xmm1, %xmm2 2695; SSE2-NEXT: por %xmm3, 
%xmm2 2696; SSE2-NEXT: psrlw $15, %xmm0 2697; SSE2-NEXT: paddw %xmm2, %xmm0 2698; SSE2-NEXT: retq 2699; 2700; SSE41-LABEL: combine_vec_sdiv_nonuniform5: 2701; SSE41: # %bb.0: 2702; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] 2703; SSE41-NEXT: pmullw %xmm0, %xmm1 2704; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0 2705; SSE41-NEXT: paddw %xmm1, %xmm0 2706; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256> 2707; SSE41-NEXT: pmulhw %xmm0, %xmm1 2708; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2709; SSE41-NEXT: movdqa %xmm0, %xmm2 2710; SSE41-NEXT: psraw $1, %xmm2 2711; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] 2712; SSE41-NEXT: psrlw $15, %xmm0 2713; SSE41-NEXT: paddw %xmm2, %xmm0 2714; SSE41-NEXT: retq 2715; 2716; AVX1-LABEL: combine_vec_sdiv_nonuniform5: 2717; AVX1: # %bb.0: 2718; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2719; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2720; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2721; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2722; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2723; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 2724; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] 2725; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2726; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2727; AVX1-NEXT: retq 2728; 2729; AVX2-LABEL: combine_vec_sdiv_nonuniform5: 2730; AVX2: # %bb.0: 2731; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2732; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2733; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2734; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2735; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2736; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2 2737; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] 2738; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2739; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2740; AVX2-NEXT: retq 2741; 2742; AVX512F-LABEL: combine_vec_sdiv_nonuniform5: 2743; AVX512F: # %bb.0: 2744; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2745; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2746; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2747; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1 2748; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 2749; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 2750; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 2751; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2752; AVX512F-NEXT: vzeroupper 2753; AVX512F-NEXT: retq 2754; 2755; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5: 2756; AVX512BW: # %bb.0: 2757; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2758; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2759; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2760; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2761; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 2762; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2763; AVX512BW-NEXT: retq 2764; 2765; XOP-LABEL: combine_vec_sdiv_nonuniform5: 2766; XOP: # %bb.0: 2767; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2768; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0 2769; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2770; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 2771; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2772; XOP-NEXT: retq 2773 %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511> 2774 ret <8 x i16> %1 2775} 2776 2777define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { 2778; SSE2-LABEL: combine_vec_sdiv_nonuniform6: 2779; SSE2: # %bb.0: 2780; SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[65535,65535,65535,65535,1,1,1,0] 2781; SSE2-NEXT: pmullw %xmm0, %xmm1 2782; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0 2783; SSE2-NEXT: paddw %xmm1, %xmm0 2784; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] 2785; SSE2-NEXT: movdqa %xmm0, %xmm2 2786; SSE2-NEXT: psraw $8, %xmm2 2787; SSE2-NEXT: pand %xmm1, %xmm2 2788; SSE2-NEXT: pandn %xmm0, %xmm1 2789; SSE2-NEXT: por %xmm2, %xmm1 2790; SSE2-NEXT: movdqa %xmm1, %xmm2 2791; SSE2-NEXT: psraw $6, %xmm2 2792; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] 2793; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0] 2794; SSE2-NEXT: pand %xmm4, %xmm1 2795; SSE2-NEXT: movdqa %xmm0, %xmm5 2796; SSE2-NEXT: psraw $12, %xmm5 2797; SSE2-NEXT: pandn %xmm5, %xmm4 2798; SSE2-NEXT: por %xmm1, %xmm4 2799; SSE2-NEXT: pand %xmm3, %xmm4 2800; SSE2-NEXT: pandn %xmm2, %xmm3 2801; SSE2-NEXT: por %xmm4, %xmm3 2802; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0] 2803; SSE2-NEXT: movdqa %xmm3, %xmm2 2804; SSE2-NEXT: pand %xmm1, %xmm2 2805; SSE2-NEXT: psraw $1, %xmm3 2806; SSE2-NEXT: pandn %xmm3, %xmm1 2807; SSE2-NEXT: por %xmm2, %xmm1 2808; SSE2-NEXT: psrlw $15, %xmm0 2809; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 2810; SSE2-NEXT: paddw %xmm1, %xmm0 2811; SSE2-NEXT: retq 2812; 2813; SSE41-LABEL: combine_vec_sdiv_nonuniform6: 2814; SSE41: # %bb.0: 2815; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] 2816; SSE41-NEXT: pmullw %xmm0, %xmm1 2817; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0 2818; SSE41-NEXT: paddw %xmm1, %xmm0 2819; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8> 2820; SSE41-NEXT: pmulhw %xmm0, %xmm2 2821; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 2822; SSE41-NEXT: psrlw $15, %xmm0 2823; SSE41-NEXT: pxor %xmm1, %xmm1 2824; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] 2825; SSE41-NEXT: paddw %xmm2, %xmm1 2826; SSE41-NEXT: movdqa %xmm1, %xmm0 2827; SSE41-NEXT: retq 2828; 2829; AVX1-LABEL: combine_vec_sdiv_nonuniform6: 2830; AVX1: # %bb.0: 2831; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2832; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2833; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2834; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2835; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2836; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2837; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2838; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] 2839; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2840; AVX1-NEXT: retq 2841; 2842; AVX2-LABEL: combine_vec_sdiv_nonuniform6: 2843; AVX2: # %bb.0: 2844; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2845; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2846; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2847; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2848; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2849; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2850; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2851; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] 2852; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2853; AVX2-NEXT: retq 2854; 2855; AVX512F-LABEL: combine_vec_sdiv_nonuniform6: 2856; AVX512F: # %bb.0: 2857; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2858; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2859; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2860; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1 2861; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 2862; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 2863; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 2864; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 2865; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 2866; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2867; AVX512F-NEXT: vzeroupper 2868; AVX512F-NEXT: retq 2869; 2870; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6: 2871; AVX512BW: # %bb.0: 2872; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 2873; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 2874; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2875; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2876; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 2877; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 2878; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 2879; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2880; AVX512BW-NEXT: retq 2881; 2882; XOP-LABEL: combine_vec_sdiv_nonuniform6: 2883; XOP: # %bb.0: 2884; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 2885; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0 2886; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2887; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 2888; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 2889; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 2890; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2891; XOP-NEXT: retq 2892 %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767> 2893 ret <8 x i16> %1 2894} 2895 2896define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) { 2897; SSE2-LABEL: combine_vec_sdiv_nonuniform7: 2898; SSE2: # %bb.0: 2899; SSE2-NEXT: pxor %xmm1, %xmm1 2900; SSE2-NEXT: psubw %xmm0, %xmm1 2901; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2902; SSE2-NEXT: retq 2903; 2904; SSE41-LABEL: combine_vec_sdiv_nonuniform7: 2905; SSE41: # %bb.0: 2906; SSE41-NEXT: pxor %xmm1, %xmm1 2907; SSE41-NEXT: psubw %xmm0, %xmm1 2908; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2909; SSE41-NEXT: retq 2910; 2911; AVX1-LABEL: combine_vec_sdiv_nonuniform7: 2912; AVX1: # %bb.0: 2913; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2914; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 2915; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2916; AVX1-NEXT: retq 2917; 2918; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7: 2919; AVX2ORLATER: # %bb.0: 2920; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1 2921; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1 2922; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2923; AVX2ORLATER-NEXT: retq 2924; 2925; XOP-LABEL: combine_vec_sdiv_nonuniform7: 2926; XOP: # %bb.0: 2927; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 2928; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1 2929; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2930; XOP-NEXT: retq 2931 %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1> 2932 ret <8 x i16> %1 2933} 2934 2935define <16 x i8> @pr38658(<16 x i8> %x) { 2936; SSE2-LABEL: pr38658: 2937; SSE2: # %bb.0: 2938; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2939; SSE2-NEXT: psraw $8, %xmm2 2940; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2 2941; SSE2-NEXT: psrlw $8, %xmm2 2942; SSE2-NEXT: pxor %xmm3, %xmm3 2943; SSE2-NEXT: pxor %xmm1, %xmm1 2944; SSE2-NEXT: packuswb %xmm2, %xmm1 2945; SSE2-NEXT: paddb %xmm0, %xmm1 2946; SSE2-NEXT: movdqa %xmm1, %xmm0 2947; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2948; SSE2-NEXT: movdqa %xmm1, %xmm2 2949; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2950; SSE2-NEXT: psraw $8, %xmm2 2951; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2 2952; SSE2-NEXT: psrlw $8, %xmm2 2953; SSE2-NEXT: packuswb %xmm2, %xmm0 2954; SSE2-NEXT: psrlw $7, %xmm1 2955; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 2956; SSE2-NEXT: paddb %xmm0, %xmm1 2957; SSE2-NEXT: movdqa %xmm1, %xmm0 2958; SSE2-NEXT: retq 2959; 2960; SSE41-LABEL: pr38658: 2961; SSE41: # %bb.0: 2962; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2963; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 2964; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 2965; SSE41-NEXT: psrlw $8, %xmm2 2966; SSE41-NEXT: pxor %xmm1, %xmm1 2967; SSE41-NEXT: packuswb %xmm2, %xmm1 2968; SSE41-NEXT: paddb %xmm0, %xmm1 2969; SSE41-NEXT: movdqa %xmm1, %xmm0 2970; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 2971; SSE41-NEXT: psraw $8, %xmm0 2972; SSE41-NEXT: movdqa %xmm0, %xmm2 2973; SSE41-NEXT: psllw $6, %xmm2 2974; SSE41-NEXT: psllw $8, %xmm0 2975; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] 2976; SSE41-NEXT: psrlw $8, %xmm0 2977; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2978; SSE41-NEXT: packuswb %xmm0, %xmm2 2979; SSE41-NEXT: psrlw $7, %xmm1 2980; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 2981; SSE41-NEXT: paddb %xmm2, %xmm1 2982; SSE41-NEXT: movdqa %xmm1, %xmm0 2983; SSE41-NEXT: retq 2984; 2985; AVX1-LABEL: pr38658: 2986; AVX1: # %bb.0: 2987; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2988; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 2989; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 2990; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2991; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2992; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 2993; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 2994; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2995; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 2996; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2 2997; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 2998; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] 2999; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 3000; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3001; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 3002; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 3003; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3004; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3005; AVX1-NEXT: retq 3006; 3007; AVX2-LABEL: pr38658: 3008; AVX2: # %bb.0: 3009; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 3010; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 3011; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 3012; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3013; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 3014; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3015; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 3016; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 3017; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 3018; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3019; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 3020; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 3021; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3022; AVX2-NEXT: 
vpaddb %xmm0, %xmm1, %xmm0 3023; AVX2-NEXT: vzeroupper 3024; AVX2-NEXT: retq 3025; 3026; AVX512F-LABEL: pr38658: 3027; AVX512F: # %bb.0: 3028; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1 3029; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 3030; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 3031; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 3032; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 3033; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3034; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1 3035; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 3036; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 3037; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 3038; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3039; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0 3040; AVX512F-NEXT: vzeroupper 3041; AVX512F-NEXT: retq 3042; 3043; AVX512BW-LABEL: pr38658: 3044; AVX512BW: # %bb.0: 3045; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 3046; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 3047; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 3048; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 3049; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3050; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1 3051; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 3052; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 3053; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 3054; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 3055; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 3056; AVX512BW-NEXT: vzeroupper 3057; AVX512BW-NEXT: retq 3058; 3059; XOP-LABEL: pr38658: 3060; XOP: # %bb.0: 3061; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 3062; XOP-NEXT: vpmovsxbw %xmm1, %xmm1 3063; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 3064; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 3065; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15] 3066; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3067; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm1 3068; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 3069; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3070; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3071; XOP-NEXT: retq 3072 %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7> 3073 ret <16 x i8> %1 3074} 3075 3076define i1 @bool_sdiv(i1 %x, i1 %y) { 3077; CHECK-LABEL: bool_sdiv: 3078; CHECK: # %bb.0: 3079; CHECK-NEXT: movl %edi, %eax 3080; CHECK-NEXT: # kill: def $al killed $al killed $eax 3081; CHECK-NEXT: retq 3082 %r = sdiv i1 %x, %y 3083 ret i1 %r 3084} 3085 3086define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) { 3087; CHECK-LABEL: boolvec_sdiv: 3088; CHECK: # %bb.0: 3089; CHECK-NEXT: retq 3090 %r = sdiv <4 x i1> %x, %y 3091 ret <4 x i1> %r 3092} 3093 3094define i32 @combine_sdiv_two(i32 %x) { 3095; CHECK-LABEL: combine_sdiv_two: 3096; CHECK: # %bb.0: 3097; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3098; CHECK-NEXT: movl %edi, %eax 3099; CHECK-NEXT: shrl $31, %eax 3100; CHECK-NEXT: addl %edi, %eax 3101; CHECK-NEXT: sarl %eax 3102; CHECK-NEXT: retq 3103 %1 = sdiv i32 %x, 2 3104 ret i32 %1 3105} 3106 3107define i32 @combine_sdiv_negtwo(i32 %x) { 3108; CHECK-LABEL: combine_sdiv_negtwo: 3109; CHECK: # %bb.0: 3110; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3111; CHECK-NEXT: movl %edi, %eax 3112; CHECK-NEXT: shrl $31, %eax 3113; CHECK-NEXT: addl %edi, %eax 3114; CHECK-NEXT: sarl %eax 3115; CHECK-NEXT: negl %eax 3116; CHECK-NEXT: retq 3117 %1 = sdiv i32 %x, -2 3118 ret 
i32 %1 3119} 3120 3121define i8 @combine_i8_sdiv_pow2(i8 %x) { 3122; CHECK-LABEL: combine_i8_sdiv_pow2: 3123; CHECK: # %bb.0: 3124; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3125; CHECK-NEXT: movl %edi, %eax 3126; CHECK-NEXT: sarb $7, %al 3127; CHECK-NEXT: shrb $4, %al 3128; CHECK-NEXT: addl %edi, %eax 3129; CHECK-NEXT: sarb $4, %al 3130; CHECK-NEXT: # kill: def $al killed $al killed $eax 3131; CHECK-NEXT: retq 3132 %1 = sdiv i8 %x, 16 3133 ret i8 %1 3134} 3135 3136define i8 @combine_i8_sdiv_negpow2(i8 %x) { 3137; CHECK-LABEL: combine_i8_sdiv_negpow2: 3138; CHECK: # %bb.0: 3139; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3140; CHECK-NEXT: movl %edi, %eax 3141; CHECK-NEXT: sarb $7, %al 3142; CHECK-NEXT: shrb $2, %al 3143; CHECK-NEXT: addl %edi, %eax 3144; CHECK-NEXT: sarb $6, %al 3145; CHECK-NEXT: negb %al 3146; CHECK-NEXT: # kill: def $al killed $al killed $eax 3147; CHECK-NEXT: retq 3148 %1 = sdiv i8 %x, -64 3149 ret i8 %1 3150} 3151 3152define i16 @combine_i16_sdiv_pow2(i16 %x) { 3153; CHECK-LABEL: combine_i16_sdiv_pow2: 3154; CHECK: # %bb.0: 3155; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3156; CHECK-NEXT: leal 15(%rdi), %eax 3157; CHECK-NEXT: testw %di, %di 3158; CHECK-NEXT: cmovnsl %edi, %eax 3159; CHECK-NEXT: cwtl 3160; CHECK-NEXT: shrl $4, %eax 3161; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 3162; CHECK-NEXT: retq 3163 %1 = sdiv i16 %x, 16 3164 ret i16 %1 3165} 3166 3167define i16 @combine_i16_sdiv_negpow2(i16 %x) { 3168; CHECK-LABEL: combine_i16_sdiv_negpow2: 3169; CHECK: # %bb.0: 3170; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3171; CHECK-NEXT: leal 255(%rdi), %eax 3172; CHECK-NEXT: testw %di, %di 3173; CHECK-NEXT: cmovnsl %edi, %eax 3174; CHECK-NEXT: cwtl 3175; CHECK-NEXT: sarl $8, %eax 3176; CHECK-NEXT: negl %eax 3177; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 3178; CHECK-NEXT: retq 3179 %1 = sdiv i16 %x, -256 3180 ret i16 %1 3181} 3182 3183define i32 @combine_i32_sdiv_pow2(i32 %x) { 3184; CHECK-LABEL: combine_i32_sdiv_pow2: 3185; CHECK: # %bb.0: 3186; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3187; CHECK-NEXT: leal 15(%rdi), %eax 3188; CHECK-NEXT: testl %edi, %edi 3189; CHECK-NEXT: cmovnsl %edi, %eax 3190; CHECK-NEXT: sarl $4, %eax 3191; CHECK-NEXT: retq 3192 %1 = sdiv i32 %x, 16 3193 ret i32 %1 3194} 3195 3196define i32 @combine_i32_sdiv_negpow2(i32 %x) { 3197; CHECK-LABEL: combine_i32_sdiv_negpow2: 3198; CHECK: # %bb.0: 3199; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3200; CHECK-NEXT: leal 255(%rdi), %eax 3201; CHECK-NEXT: testl %edi, %edi 3202; CHECK-NEXT: cmovnsl %edi, %eax 3203; CHECK-NEXT: sarl $8, %eax 3204; CHECK-NEXT: negl %eax 3205; CHECK-NEXT: retq 3206 %1 = sdiv i32 %x, -256 3207 ret i32 %1 3208} 3209 3210define i64 @combine_i64_sdiv_pow2(i64 %x) { 3211; CHECK-LABEL: combine_i64_sdiv_pow2: 3212; CHECK: # %bb.0: 3213; CHECK-NEXT: leaq 15(%rdi), %rax 3214; CHECK-NEXT: testq %rdi, %rdi 3215; CHECK-NEXT: cmovnsq %rdi, %rax 3216; CHECK-NEXT: sarq $4, %rax 3217; CHECK-NEXT: retq 3218 %1 = sdiv i64 %x, 16 3219 ret i64 %1 3220} 3221 3222define i64 @combine_i64_sdiv_negpow2(i64 %x) { 3223; CHECK-LABEL: combine_i64_sdiv_negpow2: 3224; CHECK: # %bb.0: 3225; CHECK-NEXT: leaq 255(%rdi), %rax 3226; CHECK-NEXT: testq %rdi, %rdi 3227; CHECK-NEXT: cmovnsq %rdi, %rax 3228; CHECK-NEXT: sarq $8, %rax 3229; CHECK-NEXT: negq %rax 3230; CHECK-NEXT: retq 3231 %1 = sdiv i64 %x, -256 3232 ret i64 %1 3233} 3234
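; The scalar power-of-two tests above all lower to the standard "bias and
; shift" sequence instead of an idiv: for a divisor of 2^k, the bias (2^k - 1)
; is added only when x is negative (test + cmovns for i16/i32/i64, or a shift
; of the sign bit for i8), followed by an arithmetic shift right by k, plus a
; final negate when the divisor is negative.
;
; A minimal C sketch of the sequence emitted for @combine_i32_sdiv_pow2
; (illustrative only, not part of the test; assumes arithmetic right shift of
; negative signed values, as on x86):
;
;   int32_t sdiv_by_16(int32_t x) {
;     int32_t biased = x + 15;     // leal 15(%rdi), %eax
;     if (x >= 0)                  // testl %edi, %edi
;       biased = x;                // cmovnsl %edi, %eax
;     return biased >> 4;         // sarl $4, %eax
;   }
;
; Adding 2^k - 1 before the shift turns the shift's floor behaviour into the
; round-toward-zero result that sdiv requires for negative operands.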