; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

;
; sdiv by 7
;
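; The sdiv expansions below avoid a hardware divide and instead multiply by a
; fixed-point reciprocal ("magic number") and correct with shifts. A rough
; sketch of the scalar i64 recipe (the narrower widths use the analogous
; constants for their element size):
;
;   M  = ceil(2^65 / 7) = 5270498306774157605 = 0x4924924924924925
;   hi = (a * M) >> 64              (the %rdx half of imulq)
;   q  = (hi >> 1) + (hi >>> 63)    (arithmetic shift, then add the sign bit)
;
; e.g. for a = 100: hi = floor(100 * M / 2^64) = 28, and q = (28 >> 1) + 0 = 14.
;
; x86 has no packed multiply-high for 64-bit elements, so the <8 x i64> case is
; scalarized through GPRs; the narrower types stay in vector registers
; (vpmuldq for i32, vpmulhw for i16, and a widen-to-i16 multiply for i8).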
define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT: vpextrq $1, %xmm1, %rax
; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vmovq %xmm1, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX-NEXT: retq
  %res = sdiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %res
}

define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_div7_16i32:
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm0
; AVX-NEXT: vpsrld $31, %zmm0, %zmm1
; AVX-NEXT: vpsrad $2, %zmm0, %zmm0
; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
  %res = sdiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i32> %res
}

define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm3
; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm2
; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm1
; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %res = sdiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <32 x i16> %res
}
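; There is no byte multiply-high either, so for <64 x i8> the inputs are
; sign-extended to i16, multiplied by the i8 magic constant (-109, which shows
; up sign-extended as 65427 in the i16 lanes), and the high byte of each
; product is repacked. Bytes also lack an arithmetic shift: after the logical
; shift right by 2 and the mask with 63, the xor/subtract with 32 sign-extends
; bit 5 (for a 6-bit value x, (x ^ 32) - 32 restores the sign), and the
; quotient's sign bit (vpsrlw $7 / vpand with 1) is added to round toward zero.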
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm2[2,3]
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm7
; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3]
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512F-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512BW-NEXT: vpmovsxbw %ymm3, %zmm3
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
  %res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <64 x i8> %res
}

;
; srem by 7
;
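; The srem expansions reuse the sdiv-by-7 sequences above and then form the
; remainder as a - 7 * q. In the scalar i64 code this is leaq (,%rdx,8) to get
; 8*q, subq to fold it into -7*q, and addq of the original element; the vector
; versions multiply q by a splat of 7 (vpmullw/vpmulld) and subtract.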
define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_rem7_8i64:
; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT: vpextrq $1, %xmm1, %rcx
; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX-NEXT: retq
  %res = srem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %res
}

define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_rem7_16i32:
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm1
; AVX-NEXT: vpsrld $31, %zmm1, %zmm2
; AVX-NEXT: vpsrad $2, %zmm1, %zmm1
; AVX-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
  %res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i32> %res
}

define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3
; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2
; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <32 x i16> %res
}
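; The <64 x i8> remainder also has to avoid a byte multiply when forming 7 * q:
; q is sign-extended to i16, multiplied by 7, and truncated back to bytes
; (vpmovsxwd + vpmovdb on AVX512F, vpmovwb on AVX512BW) before the final vpsubb.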
define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_rem7_64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3]
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512F-NEXT: vpxor %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm7, %ymm3, %ymm7
; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm8
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm8
; AVX512F-NEXT: vpmovsxwd %ymm8, %zmm8
; AVX512F-NEXT: vpmovdb %zmm8, %xmm8
; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm7
; AVX512F-NEXT: vpmovsxwd %ymm7, %zmm7
; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm7
; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
; AVX512F-NEXT: vpmullw %ymm2, %ymm7, %ymm7
; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm7
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm8
; AVX512F-NEXT: vpmullw %ymm2, %ymm8, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm2[2,3],ymm7[2,3]
; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm8, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm7
; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512BW-NEXT: vpmovsxbw %ymm3, %zmm3
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxorq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsubb %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <64 x i8> %res
}