; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

;
; udiv by 7
;

define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    shrq %rcx
; SSE2-NEXT:    addq %rdx, %rcx
; SSE2-NEXT:    shrq $2, %rcx
; SSE2-NEXT:    movd %rcx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    shrq %rcx
; SSE2-NEXT:    addq %rdx, %rcx
; SSE2-NEXT:    shrq $2, %rcx
; SSE2-NEXT:    movd %rcx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    shrq %rcx
; SSE41-NEXT:    addq %rdx, %rcx
; SSE41-NEXT:    shrq $2, %rcx
; SSE41-NEXT:    movd %rcx, %xmm1
; SSE41-NEXT:    movd %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    shrq %rcx
; SSE41-NEXT:    addq %rdx, %rcx
; SSE41-NEXT:    shrq $2, %rcx
; SSE41-NEXT:    movd %rcx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    shrq %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    shrq $2, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    shrq %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    shrq $2, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = udiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
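; The scalar expansion above is the standard magic-number unsigned division
; (Granlund-Montgomery / Hacker's Delight): multiply by
; M = floor(2^64 / 7) + 1 = 0x2492492492492493, keep the high 64 bits, then
; apply a fixup because the exact reciprocal needs one bit more than 64:
;   h = (n * M) >> 64                  (mulq leaves this half in %rdx)
;   q = (((n - h) >> 1) + h) >> 2      (subq / shrq / addq / shrq $2)
; A minimal C sketch of the per-element arithmetic (explanatory note, not
; part of the autogenerated checks; assumes unsigned __int128 support):
;   #include <stdint.h>
;   uint64_t udiv7(uint64_t n) {
;     uint64_t h = (uint64_t)(((unsigned __int128)n * 0x2492492492492493ULL) >> 64);
;     return (((n - h) >> 1) + h) >> 2;
;   }
; There is no v2i64 multiply-high instruction, which is why both elements
; are moved to GPRs, divided there, and reassembled with punpcklqdq.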

define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    psubd %xmm2, %xmm0
; SSE2-NEXT:    psrld $1, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    psrld $2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pmuludq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    psubd %xmm1, %xmm0
; SSE41-NEXT:    psrld $1, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    psrld $2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}
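; Same recipe per 32-bit lane with M = floor(2^32 / 7) + 1 = 613566757
; (0x24924925); in C notation (explanatory only):
;   uint32_t h = (uint32_t)(((uint64_t)n * 613566757u) >> 32);
;   uint32_t q = (((n - h) >> 1) + h) >> 2;
; SSE has no 32-bit multiply-high, so the lowering builds one: pmuludq forms
; the 32x32->64 products of the even lanes, the odd lanes are shuffled into
; even position and multiplied separately, and the two sets of high halves
; are recombined (punpckldq on SSE2, pblendw/pblendd once SSE4.1/AVX2
; blends are available).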

define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
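; The 16-bit case is the cheapest: pmulhuw is a native unsigned
; multiply-high, so one multiply by M = floor(2^16 / 7) + 1 = 9363 plus the
; usual sub/shift/add/shift fixup divides all eight lanes in five vector
; instructions with no shuffling. Per lane (explanatory C only):
;   uint16_t h = (uint16_t)(((uint32_t)n * 9363u) >> 16);
;   uint16_t q = (uint16_t)((((n - h) >> 1) + h) >> 2);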

define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm1, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    psubb %xmm3, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    paddb %xmm3, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    psrlw $1, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
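; No SSE/AVX instruction multiplies bytes, so the 8-bit lanes are widened to
; words (punpck{l,h}bw + psrlw on SSE2, pmovzxbw on SSE4.1/AVX1, while AVX2
; covers all 16 lanes with one 256-bit pmullw), multiplied by
; M = floor(2^8 / 7) + 1 = 37, and narrowed back with packuswb. The byte
; shifts in the fixup are also synthesized: x86 has no psrlb, so each psrlw
; is followed by a pand that clears the bits shifted in from the adjacent
; byte. Per lane, the arithmetic is still (explanatory C only):
;   uint8_t h = (uint8_t)(((uint16_t)n * 37u) >> 8);
;   uint8_t q = (uint8_t)((((uint8_t)(n - h) >> 1) + h) >> 2);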

;
; urem by 7
;

define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    shrq %rax
; SSE2-NEXT:    addq %rdx, %rax
; SSE2-NEXT:    shrq $2, %rax
; SSE2-NEXT:    leaq (,%rax,8), %rdx
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    movd %rcx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    shrq %rax
; SSE2-NEXT:    addq %rdx, %rax
; SSE2-NEXT:    shrq $2, %rax
; SSE2-NEXT:    leaq (,%rax,8), %rdx
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    movd %rcx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    shrq %rax
; SSE41-NEXT:    addq %rdx, %rax
; SSE41-NEXT:    shrq $2, %rax
; SSE41-NEXT:    leaq (,%rax,8), %rdx
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    movd %rcx, %xmm1
; SSE41-NEXT:    movd %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    shrq %rax
; SSE41-NEXT:    addq %rdx, %rax
; SSE41-NEXT:    shrq $2, %rax
; SSE41-NEXT:    leaq (,%rax,8), %rdx
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    movd %rcx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    addq %rdx, %rax
; AVX-NEXT:    shrq $2, %rax
; AVX-NEXT:    leaq (,%rax,8), %rdx
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    addq %rdx, %rax
; AVX-NEXT:    shrq $2, %rax
; AVX-NEXT:    leaq (,%rax,8), %rdx
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = urem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
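; The remainder tests compute the quotient q exactly as in the udiv tests
; above, then multiply back and subtract: rem = n - q * 7. In the scalar
; code q * 7 is strength-reduced to q * 8 - q, i.e. leaq (,%rax,8), %rdx
; followed by subq %rax, %rdx. A C sketch of one element (explanatory only,
; reusing the udiv7 helper sketched earlier):
;   uint64_t urem7(uint64_t n) {
;     uint64_t q = udiv7(n);
;     return n - (q * 8 - q);
;   }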

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pmuludq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    psrld $1, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    psrld $2, %xmm2
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psubw %xmm1, %xmm2
; SSE-NEXT:    psrlw $1, %xmm2
; SSE-NEXT:    paddw %xmm1, %xmm2
; SSE-NEXT:    psrlw $2, %xmm2
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
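; For the vector remainders the multiply-back uses whatever lane multiply
; the ISA offers: pmullw against [7,7,7,7,7,7,7,7] for 16-bit lanes, and
; pmulld against [7,7,7,7] on SSE4.1/AVX for 32-bit lanes. Plain SSE2 has
; no pmulld, so the 32-bit q * 7 is again synthesized from two pmuludq ops
; plus pshufd/punpckldq shuffles before the final psubd.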

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm1, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psubb %xmm3, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubb %xmm1, %xmm2
; SSE41-NEXT:    psrlw $1, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm1
; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm3
; SSE41-NEXT:    pmullw %xmm3, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
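; The 8-bit remainder needs the widening trick a second time for the
; multiply-back: the quotient bytes (all at most 36, so sign- and
; zero-extension agree) are extended to words (punpck + psraw on SSE2,
; pmovsxbw on SSE4.1/AVX), multiplied by a vector of sevens, masked back to
; bytes with pand [255,...] and repacked with packuswb before the final
; psubb. AVX2 instead multiplies in one 256-bit pmullw and truncates with
; vpshufb + vpunpcklqdq, then issues vzeroupper before returning to avoid
; an AVX-to-SSE transition penalty.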