; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT: imulq %rcx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: imulq %rcx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $31, %xmm0
; SSE2-NEXT: psrad $2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pmuldq %xmm1, %xmm2
; SSE41-NEXT: pmuldq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
; SSE41-NEXT: psrad $2, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $15, %xmm1
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_div7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psubb %xmm2, %xmm0
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $2, %xmm0
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; srem by 7
;

define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: addq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: addq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rcx
; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: addq %rcx, %rdx
; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: addq %rcx, %rdx
; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrld $31, %xmm2
; SSE2-NEXT: psrad $2, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pmuldq %xmm2, %xmm1
; SSE41-NEXT: pmuldq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrld $31, %xmm1
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: paddd %xmm1, %xmm2
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT: pmulhw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $1, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $1, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT: pxor %xmm3, %xmm2
; SSE2-NEXT: psubb %xmm3, %xmm2
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT: pxor %xmm3, %xmm2
; SSE41-NEXT: psubb %xmm3, %xmm2
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: pmullw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
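
; A brief sketch (not checked by FileCheck) of the constant-division arithmetic
; the assertions above exercise, assuming the standard round-up magic-number
; scheme for signed division by 7:
;   i64: M = 5270498306774157605 = 0x4924924924924925 = ceil(2^65 / 7); the
;        quotient is the signed high half of x*M shifted right by 1, plus the
;        sign bit of that product (the imulq / sarq / shrq $63 / addq sequence),
;        and the srem tests then form x - 7*q via leaq (,%rdx,8) / subq / addq.
;   i16: 18725 = 0x4925 = ceil(2^17 / 7), used the same way via pmulhw,
;        psraw $1 and the psrlw $15 sign fixup.
;   i32 / i8: 2454267027 = 0x92492493 and 65427 (the i16 form of the byte magic
;        -109) are negative magics, so the dividend is added back (paddd/paddb)
;        before the final arithmetic shift by 2 and the sign-bit fixup; the
;        psrlw $2 / pand / pxor / psubb dance emulates a per-byte arithmetic
;        shift, which SSE/AVX lack.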