; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;
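; These checks assume the usual "magic number" lowering (in the style of
; Hacker's Delight): signed division by a constant becomes a widening
; multiply by a precomputed constant, an arithmetic shift, and a sign-bit
; correction. Worked through for the v2i64 case below:
;   0x4924924924924925 = 5270498306774157605 = ceil(2^65 / 7)
;   quotient = ((a * 0x4924924924924925) >> 65) + (a < 0 ? 1 : 0)
; which appears as imulq (high half in %rdx), sarq (an implicit shift by
; one), and shrq $63 to recover the sign bit.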

define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT: imulq %rcx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: imulq %rcx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

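; For v4i32 the magic constant is 2454267027 (0x92492493) with a final
; arithmetic shift of 2; the constant is negative as an i32, so the dividend
; is added back after the high multiply (assuming the standard derivation):
;   t = smulhi(a, 0x92492493) + a
;   quotient = (t >> 2) + (t >>u 31)
; SSE2 has no signed 32-bit high multiply, so the checks rebuild smulhi from
; pmuludq, subtracting (a < 0 ? magic : 0) plus the dividend from the
; unsigned high half (the pcmpgtd/pand/paddd/psubd sequence).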
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $31, %xmm0
; SSE2-NEXT: psrad $2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pmuldq %xmm1, %xmm2
; SSE41-NEXT: pmuldq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
; SSE41-NEXT: psrad $2, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

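; v8i16 is the simplest case: pmulhw is a native signed high multiply and
; 18725 = ceil(2^17 / 7), so
;   quotient = ((a * 18725) >> 17) + (a < 0 ? 1 : 0)
; i.e. pmulhw followed by psraw $1, with psrlw $15 recovering the sign bit.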
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $15, %xmm1
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_div7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

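; There is no byte-sized multiply or shift, so v16i8 is widened: each half
; is sign-extended to i16, multiplied by 65427 (0xFF93, i.e. -109, which
; should be ceil(2^10 / 7) - 256 under the usual 8-bit derivation), and the
; high byte kept. The magic constant is negative, so the dividend is added
; back, and the byte-wise arithmetic shift by 2 is emulated with a logical
; shift plus mask and the ((x ^ 32) - 32) sign-extension trick; psrlw $7 and
; a mask of 1 supply the sign-bit correction.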
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: psubb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlw $2, %xmm0
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: psubb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; sdiv by non-splat constant
;
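; With a different divisor in every lane, each element needs its own magic
; constant and its own final shift amount, so the multiplies below take
; their operands from per-element constant pools, and AVX512BW can apply
; the per-lane shift vector [2,2,1,2,3,...] in a single vpsravw.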

define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_divconstant_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_divconstant_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT: psraw $8, %xmm1
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: psrlw $7, %xmm0
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_divconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT: vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}

;
; srem by 7
;
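; The remainder lowerings are expected to reuse the sdiv-by-7 sequences
; above and then fold the divisor back in: rem = n - 7*q. The multiply by 7
; is either a real multiply (pmullw/pmulld) or strength-reduced to a shift
; and a subtract, e.g. the scalar leaq (,%rdx,8) + subq pair computing
; q - 8*q before the dividend is added back.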

define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: addq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: addq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rcx
; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: addq %rcx, %rdx
; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: addq %rcx, %rdx
; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrld $31, %xmm1
; SSE2-NEXT: psrad $2, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pslld $3, %xmm1
; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pmuldq %xmm2, %xmm1
; SSE41-NEXT: pmuldq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrld $31, %xmm1
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: paddd %xmm1, %xmm2
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT: pmulhw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $1, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $1, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

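; For bytes there is no pmullb (nor a byte shift), so the 7*q term below is
; assembled as (q << 3) - q, with psllw $3 plus a mask clearing the bits
; that the word-sized shift smears across byte-lane boundaries.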
define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT: pxor %xmm3, %xmm2
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: psubb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psllw $3, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: psubb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT: pxor %xmm3, %xmm2
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $3, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; srem by non-splat constant
;
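; As in the non-splat sdiv above, every lane gets its own magic constant and
; shift; the final multiply by the divisor vector <7,8,9,...> is likewise
; done per-element on widened i16 halves (pmullw) before repacking and
; subtracting from the dividend.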

define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_remconstant_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE41-NEXT: psraw $8, %xmm3
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm3, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_remconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
; AVX2NOBW-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm3
; AVX512BW-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm3
; AVX512BW-NEXT: vpsravw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsrlw $7, %xmm2, %xmm2
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = srem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}

; This test is just to show what a scalarized v16i8 division looks like.
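; Each lane is handled one byte at a time: the dividend byte is
; sign-extended (via movsbl from a stack spill, or pextrb plus cbtw),
; divided by the divisor byte with idivb, which leaves the quotient in %al
; and the remainder in %ah, and the remainder is read back with
; movsbl %ah and reassembled into a vector.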
define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: test_rem_variable_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem_variable_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrb $1, %xmm1, %ecx
; SSE41-NEXT: pextrb $1, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %ecx
; SSE41-NEXT: movd %xmm1, %edx
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %dl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: movd %eax, %xmm2
; SSE41-NEXT: pinsrb $1, %ecx, %xmm2
; SSE41-NEXT: pextrb $2, %xmm1, %ecx
; SSE41-NEXT: pextrb $2, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $2, %eax, %xmm2
; SSE41-NEXT: pextrb $3, %xmm1, %ecx
; SSE41-NEXT: pextrb $3, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $3, %eax, %xmm2
; SSE41-NEXT: pextrb $4, %xmm1, %ecx
; SSE41-NEXT: pextrb $4, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $4, %eax, %xmm2
; SSE41-NEXT: pextrb $5, %xmm1, %ecx
; SSE41-NEXT: pextrb $5, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $5, %eax, %xmm2
; SSE41-NEXT: pextrb $6, %xmm1, %ecx
; SSE41-NEXT: pextrb $6, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $6, %eax, %xmm2
; SSE41-NEXT: pextrb $7, %xmm1, %ecx
; SSE41-NEXT: pextrb $7, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $7, %eax, %xmm2
; SSE41-NEXT: pextrb $8, %xmm1, %ecx
; SSE41-NEXT: pextrb $8, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $8, %eax, %xmm2
; SSE41-NEXT: pextrb $9, %xmm1, %ecx
; SSE41-NEXT: pextrb $9, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $9, %eax, %xmm2
; SSE41-NEXT: pextrb $10, %xmm1, %ecx
; SSE41-NEXT: pextrb $10, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $10, %eax, %xmm2
; SSE41-NEXT: pextrb $11, %xmm1, %ecx
; SSE41-NEXT: pextrb $11, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $11, %eax, %xmm2
; SSE41-NEXT: pextrb $12, %xmm1, %ecx
; SSE41-NEXT: pextrb $12, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $12, %eax, %xmm2
; SSE41-NEXT: pextrb $13, %xmm1, %ecx
; SSE41-NEXT: pextrb $13, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $13, %eax, %xmm2
; SSE41-NEXT: pextrb $14, %xmm1, %ecx
; SSE41-NEXT: pextrb $14, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $14, %eax, %xmm2
; SSE41-NEXT: pextrb $15, %xmm1, %ecx
; SSE41-NEXT: pextrb $15, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_rem_variable_16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpextrb $1, %xmm1, %ecx
; AVX-NEXT: vpextrb $1, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %ecx
; AVX-NEXT: vmovd %xmm1, %edx
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %dl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vmovd %eax, %xmm2
; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
; AVX-NEXT: vpextrb $2, %xmm1, %ecx
; AVX-NEXT: vpextrb $2, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $3, %xmm1, %ecx
; AVX-NEXT: vpextrb $3, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $4, %xmm1, %ecx
; AVX-NEXT: vpextrb $4, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $5, %xmm1, %ecx
; AVX-NEXT: vpextrb $5, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $6, %xmm1, %ecx
; AVX-NEXT: vpextrb $6, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $7, %xmm1, %ecx
; AVX-NEXT: vpextrb $7, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $8, %xmm1, %ecx
; AVX-NEXT: vpextrb $8, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $9, %xmm1, %ecx
; AVX-NEXT: vpextrb $9, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $10, %xmm1, %ecx
; AVX-NEXT: vpextrb $10, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $11, %xmm1, %ecx
; AVX-NEXT: vpextrb $11, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $12, %xmm1, %ecx
; AVX-NEXT: vpextrb $12, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $13, %xmm1, %ecx
; AVX-NEXT: vpextrb $13, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $14, %xmm1, %ecx
; AVX-NEXT: vpextrb $14, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $15, %xmm1, %ecx
; AVX-NEXT: vpextrb $15, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
; AVX-NEXT: retq
  %res = srem <16 x i8> %a, %b
  ret <16 x i8> %res
}