; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

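; The splat expansions below all use the standard signed magic-number
; technique: take the high half of a widening multiply by an approximation
; of 2^k/7, shift it right, then add the sign bit so the quotient rounds
; toward zero. A minimal C sketch of the 64-bit lane computation checked in
; test_div7_2i64 (the helper name and __int128 usage are illustrative only):
;
;   int64_t div7_i64(int64_t x) {
;     // 0x4924924924924925 == ceil(2^65 / 7)
;     int64_t hi = (int64_t)(((__int128)x * 0x4924924924924925LL) >> 64);
;     return (hi >> 1) + (int64_t)((uint64_t)hi >> 63); // shift + sign fix
;   }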
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm1, %xmm2
; SSE41-NEXT:    pmuldq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}
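; test_div7_4i32 above is the 32-bit analogue. SSE4.1/AVX get the high half
; of the signed widening multiply from pmuldq; plain SSE2 has no signed
; 32x32->64 multiply, so it reconstructs that high half from pmuludq plus a
; pcmpgtd-based sign correction. Per lane this computes (illustrative C only):
;
;   int32_t div7_i32(int32_t x) {
;     // 2454267027 == 0x92492493, the signed magic for 7 (needs an add-back)
;     int32_t hi = (int32_t)(((int64_t)x * (int32_t)0x92492493) >> 32) + x;
;     return (hi >> 2) + (int32_t)((uint32_t)hi >> 31);
;   }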

define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
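; For 16-bit lanes (test_div7_8i16 above) a single pmulhw already yields the
; high half of the signed product, so no widening is needed; per lane
; (illustrative C only):
;
;   int16_t div7_i16(int16_t x) {
;     int16_t hi = (int16_t)(((int32_t)x * 18725) >> 16); // 18725 == ceil(2^17/7)
;     return (hi >> 1) + (int16_t)((uint16_t)hi >> 15);
;   }
;
; Bytes have neither a multiply-high nor an arithmetic shift, so
; test_div7_16i8 below sign-extends to words, multiplies by 65427 (-109, the
; i8 magic for 7), and narrows the high bytes back with packuswb. The missing
; byte-wide arithmetic shift by 2 is emulated with the usual xor-and-subtract
; trick (illustrative C; 32 is the sign bit position after the shift):
;
;   int8_t sar2_i8(int8_t t) { return (int8_t)((((uint8_t)t >> 2) ^ 32) - 32); }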

define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    psubb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    psubb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; sdiv by non-splat constant
;

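; With a distinct divisor in every lane the same recipe is applied
; element-wise: the multipliers, the optional add-back of x (done here with a
; 0/255 mask and paddb), and the final shift amounts all come from
; constant-pool vectors instead of splats. The AVX512BW version can even
; perform the per-lane arithmetic shift directly with vpsravw and the shift
; vector [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] checked below.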
define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_divconstant_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm1, %xmm2
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm1, %xmm2
; SSE2-NEXT:    psrlw $7, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_divconstant_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT:    psraw $8, %xmm1
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT:    psraw $8, %xmm2
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    paddb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_divconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}

;
; srem by 7
;
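; The remainder tests compute the quotient q exactly as in the sdiv tests
; above and then subtract q*7 from the input n, i.e. r = n - q*7 per lane.
; The scalar i64 lanes fold the multiply into address arithmetic: leaq
; (,%rdx,8) forms 8*q, and the following subtract/add pair yields
; n - (8*q - q) == n - 7*q.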

define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    psrad $2, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm2, %xmm1
; SSE41-NEXT:    pmuldq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrld $31, %xmm1
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT:    pxor %xmm3, %xmm2
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psllw $3, %xmm2
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE2-NEXT:    psubb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    psubb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psllw $3, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm1
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; srem by non-splat constant
;
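; Same idea with per-lane divisors: the quotients are computed as in
; test_divconstant_16i8, multiplied back by the divisor vector (pmullw on the
; unpacked halves, masked down to bytes and repacked with packuswb), and
; subtracted, giving r[i] = x[i] - q[i]*d[i] in every lane.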

define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_remconstant_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    psraw $8, %xmm2
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE41-NEXT:    psraw $8, %xmm3
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm2, %xmm3
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_remconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
; AVX2NOBW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm3
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm2, %xmm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}

; This test is just to show what a scalarized v16i8 division looks like.
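; For each lane the dividend is moved to a GPR, sign-extended with cbtw, and
; divided by the divisor lane with idivb, which leaves the quotient in %al
; and the remainder in %ah; the remainder is sign-extended out of %ah and
; reinserted, i.e. r[i] = a[i] % b[i] done sixteen times in scalar code.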
define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: test_rem_variable_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm4
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem_variable_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrb $1, %xmm1, %ecx
; SSE41-NEXT:    pextrb $1, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %ecx
; SSE41-NEXT:    movd %xmm1, %edx
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %dl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    movd %eax, %xmm2
; SSE41-NEXT:    pinsrb $1, %ecx, %xmm2
; SSE41-NEXT:    pextrb $2, %xmm1, %ecx
; SSE41-NEXT:    pextrb $2, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $2, %eax, %xmm2
; SSE41-NEXT:    pextrb $3, %xmm1, %ecx
; SSE41-NEXT:    pextrb $3, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $3, %eax, %xmm2
; SSE41-NEXT:    pextrb $4, %xmm1, %ecx
; SSE41-NEXT:    pextrb $4, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $4, %eax, %xmm2
; SSE41-NEXT:    pextrb $5, %xmm1, %ecx
; SSE41-NEXT:    pextrb $5, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $5, %eax, %xmm2
; SSE41-NEXT:    pextrb $6, %xmm1, %ecx
; SSE41-NEXT:    pextrb $6, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $6, %eax, %xmm2
; SSE41-NEXT:    pextrb $7, %xmm1, %ecx
; SSE41-NEXT:    pextrb $7, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $7, %eax, %xmm2
; SSE41-NEXT:    pextrb $8, %xmm1, %ecx
; SSE41-NEXT:    pextrb $8, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $8, %eax, %xmm2
; SSE41-NEXT:    pextrb $9, %xmm1, %ecx
; SSE41-NEXT:    pextrb $9, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $9, %eax, %xmm2
; SSE41-NEXT:    pextrb $10, %xmm1, %ecx
; SSE41-NEXT:    pextrb $10, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $10, %eax, %xmm2
; SSE41-NEXT:    pextrb $11, %xmm1, %ecx
; SSE41-NEXT:    pextrb $11, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $11, %eax, %xmm2
; SSE41-NEXT:    pextrb $12, %xmm1, %ecx
; SSE41-NEXT:    pextrb $12, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $12, %eax, %xmm2
; SSE41-NEXT:    pextrb $13, %xmm1, %ecx
; SSE41-NEXT:    pextrb $13, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $13, %eax, %xmm2
; SSE41-NEXT:    pextrb $14, %xmm1, %ecx
; SSE41-NEXT:    pextrb $14, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $14, %eax, %xmm2
; SSE41-NEXT:    pextrb $15, %xmm1, %ecx
; SSE41-NEXT:    pextrb $15, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $15, %eax, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem_variable_16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrb $1, %xmm1, %ecx
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %ecx
; AVX-NEXT:    vmovd %xmm1, %edx
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %dl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vmovd %eax, %xmm2
; AVX-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $2, %xmm1, %ecx
; AVX-NEXT:    vpextrb $2, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $3, %xmm1, %ecx
; AVX-NEXT:    vpextrb $3, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $4, %xmm1, %ecx
; AVX-NEXT:    vpextrb $4, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $5, %xmm1, %ecx
; AVX-NEXT:    vpextrb $5, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $6, %xmm1, %ecx
; AVX-NEXT:    vpextrb $6, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $7, %xmm1, %ecx
; AVX-NEXT:    vpextrb $7, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $8, %xmm1, %ecx
; AVX-NEXT:    vpextrb $8, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $9, %xmm1, %ecx
; AVX-NEXT:    vpextrb $9, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $10, %xmm1, %ecx
; AVX-NEXT:    vpextrb $10, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $11, %xmm1, %ecx
; AVX-NEXT:    vpextrb $11, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $12, %xmm1, %ecx
; AVX-NEXT:    vpextrb $12, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $13, %xmm1, %ecx
; AVX-NEXT:    vpextrb $13, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $14, %xmm1, %ecx
; AVX-NEXT:    vpextrb $14, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $15, %xmm1, %ecx
; AVX-NEXT:    vpextrb $15, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
; AVX-NEXT:    retq
  %res = srem <16 x i8> %a, %b
  ret <16 x i8> %res
}