; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

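; Every case below replaces the division with a multiply by a fixed-point
; "magic" reciprocal followed by shifts (Hacker's Delight, ch. 10). A scalar
; sketch of the i64 case, assuming M = ceil(2^65 / 7) = 0x4924924924924925
; (so 7 * M = 2^65 + 3):
;   hi = (int64_t)(((__int128)a * M) >> 64);  // imulq leaves this in %rdx
;   q  = (hi >> 1) + ((uint64_t)hi >> 63);    // 65-bit shift in total, plus
;                                             // the sign bit to round toward 0
; x86 has no vector 64-bit multiply-high, so each i64 lane is extracted to a
; GPR, divided there, and reinserted.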
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

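; For i32 the magic is 0x92492493 = (2^34 + 5) / 7, which is negative as an
; i32, so the dividend is added back after the multiply-high; roughly:
;   t = (int32_t)(((int64_t)a * (int32_t)0x92492493) >> 32) + a;
;   q = (t >> 2) + ((uint32_t)t >> 31);
; SSE2 only has the unsigned pmuludq, so the psrad/pand/psubd preamble turns
; the unsigned high half into the signed one via
; mulhs(a, m) = mulhu(a, m) - (a < 0 ? m : 0) - (m < 0 ? a : 0);
; SSE4.1 and later get pmuldq, which is signed directly.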
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm1, %xmm2
; SSE41-NEXT:    pmuldq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

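; i16 is the cheap case: pmulhw is a native signed multiply-high and the magic
; M = ceil(2^17 / 7) = 18725 is still a positive i16, so no add-back is needed:
;   hi = mulhs(a, 18725);                   // pmulhw
;   q  = (hi >> 1) + ((uint16_t)hi >> 15);  // psraw $1, psrlw $15, paddw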
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

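; There is no byte multiply, so the i8 halves are sign-extended to i16
; (punpck+psraw on SSE2, pmovsxbw later), multiplied by the magic -109
; (printed as the unsigned i16 immediate 65427 = 0xff93), and the high bytes
; repacked. The magic is negative, so a is added back, and the arithmetic
; byte shift has to be emulated; a sketch, with mulhi8 standing in for the
; widen/multiply/narrow sequence:
;   t = (int8_t)(mulhi8(a, -109) + a);
;   q = ((((uint8_t)t >> 2) ^ 32) - 32)  // psrlw $2 + pand; the pxor/psubb
;                                        // pair sign-extends from bit 5
;     + ((uint8_t)t >> 7);               // psrlw $7 + pand takes the sign bit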
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubb %xmm2, %xmm0
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; srem by 7
;

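; The remainder tests reuse the quotient sequences above and finish with
; r = a - 7 * q. In the scalar i64 code even the 7 * q avoids a multiply:
;   leaq (,%rdx,8), %rax   # rax = 8*q
;   subq %rax, %rdx        # rdx = q - 8*q = -7*q
;   addq %rcx, %rdx        # rdx = a - 7*q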
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

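; For i32 the 7 * q is a real vector multiply before the final psubd: pmulld
; against a constant-pool splat of 7 on SSE4.1/AVX1 (a vpbroadcastd on AVX2),
; while SSE2 splits it into two pmuludq's over the even and odd lanes.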
define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrld $31, %xmm2
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm2, %xmm1
; SSE41-NEXT:    pmuldq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrld $31, %xmm1
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

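; i16 again gets away with one pmullw by the splatted 7 followed by psubw,
; i.e. r = a - q * 7.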
define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

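; Lacking a byte multiply, q * 7 is also formed in widened i16 lanes (punpck
; or pmovzx, then pmullw by 7), masked back to bytes with 255, and repacked
; (packuswb here; vpshufb/vpmovwb on the AVX2NOBW/AVX512BW paths) before the
; final psubb.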
define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT:    pxor %xmm3, %xmm2
; SSE2-NEXT:    psubb %xmm3, %xmm2
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    psubb %xmm3, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE41-NEXT:    pmullw %xmm3, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2NOBW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}