; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

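; Note: AVX/AVX2 have no 64-bit element multiply, so each i64 lane is expected
; to round-trip through the scalar unit. The expansion is the usual signed
; magic-number division (cf. Hacker's Delight): since
; 7 * 0x4924924924924925 = 2^65 + 3, the quotient is (roughly)
;   hi = highhalf(a * 0x4924924924924925)   ; imulq, high half lands in %rdx
;   q  = (hi >> 1) + (hi >>logical 63)      ; sarq + shrq $63 + addq
; where the final add rounds toward zero for negative inputs.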
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: imulq %rcx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: imulq %rcx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

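; Note: for i32 the magic constant 2454267027 (0x92492493) is negative when
; taken as a signed multiplier, so the dividend is expected to be added back
; after the multiply-high (vpmuldq on even/odd lanes, recombined with
; vpblendw/vpblendd). A rough scalar model of the checks below:
;   t = mulhi_s32(n, 0x92492493) + n    ; 7 * 2454267027 = 2^34 + 5
;   q = (t >> 2) + (t >>logical 31)     ; vpsrad $2, vpsrld $31, vpaddd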
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

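; Note: i16 is the cheapest case: 7 * 18725 = 2^17 + 3, so vpmulhw already
; gives (n * 18725) >> 16 and vpsraw $1 completes the >> 17, with vpsrlw $15
; supplying the +1 for negative inputs. No lane widening is needed.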
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm1
; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

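; Note: there is no byte multiply-high or byte arithmetic shift, so the i8
; lanes are expected to be sign-extended to i16, multiplied by 65427 (0xFF93,
; i.e. -109, the add-back form of 7 * 147 = 2^10 + 5), and the high bytes
; repacked with vpackuswb. The arithmetic >> 2 on bytes is then emulated as
;   q = (((t >>logical 2) & 63) ^ 32) - 32    ; sign-extend from bit 5
; which is what the vpsrlw/vpand/vpxor/vpsubb sequences below check for.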
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7
; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_32i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
  %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}

;
; srem by 7
;

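; The remainder tests expect the quotient expansions from above followed by
; r = n - q*7. In the scalar i64 lanes, q*7 is formed without a second
; multiply:
;   leaq (,%rdx,8), %rax    # 8*q
;   subq %rax, %rdx         # q - 8*q = -7*q
;   addq %rcx, %rdx         # r = n - 7*q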
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rax, %rdx
; AVX1-NEXT: addq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rax, %rdx
; AVX1-NEXT: addq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rax, %rdx
; AVX1-NEXT: addq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
; AVX1-NEXT: subq %rax, %rdx
; AVX1-NEXT: addq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rax, %rdx
; AVX2-NEXT: addq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rax, %rdx
; AVX2-NEXT: addq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rax, %rdx
; AVX2-NEXT: addq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
; AVX2-NEXT: subq %rax, %rdx
; AVX2-NEXT: addq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

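; Note: in the vector cases below, q*7 is expected to be a single multiply
; against a splat of 7 (vpmulld for i32, vpmullw for i16) followed by a
; subtract from the original dividend.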
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm3
; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2
; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

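; Note: with no byte multiply available, q*7 for the i8 case widens the
; quotient back to i16 (vpmullw by 7) and repacks the low bytes: vpand 255 +
; vpackuswb on AVX1, vpshufb on AVX2, and vpmovwb with AVX512BW, before the
; final vpsubb from the dividend.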
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm9, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_rem7_32i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
  %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}