; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

;
; sdiv by 7
;

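; Signed division by 7 is lowered with a multiply-by-magic-constant sequence
; rather than a real divide. Roughly (a sketch of the expected codegen, not an
; exhaustive description): multiply by a fixed-point reciprocal, keep the high
; half of the product, shift it right, and add back the sign bit so the result
; rounds toward zero. There is no vector 64-bit multiply-high, so the <8 x i64>
; case below is scalarized: movabsq loads 0x4924924924924925 = ceil(2^65 / 7),
; imulq leaves the high 64 bits of the product in %rdx, sarq shifts once more
; (a total shift of 65), and shrq $63 extracts the sign bit that addq adds back.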
define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT:    vpextrq $1, %xmm1, %rax
; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm2
; AVX-NEXT:    vmovq %xmm1, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT:    vpextrq $1, %xmm2, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm3
; AVX-NEXT:    vmovq %xmm2, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpextrq $1, %xmm2, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm3
; AVX-NEXT:    vmovq %xmm2, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm3
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX-NEXT:    retq
  %res = sdiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %res
}

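; For <16 x i32> the multiply-high is assembled from two vpmuldq ops (even lanes
; directly, odd lanes exposed via vpshufd) whose high dwords are re-interleaved
; by vpermi2d. The magic 2454267027 (0x92492493) is negative as a signed i32, so
; the input is added back before the arithmetic shift by 2 and the vpsrld $31
; sign-bit correction.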
define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_div7_16i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2
; AVX-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT:    vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
; AVX-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; AVX-NEXT:    vpsrld $31, %zmm0, %zmm1
; AVX-NEXT:    vpsrad $2, %zmm0, %zmm0
; AVX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX-NEXT:    retq
  %res = sdiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i32> %res
}

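; <32 x i16> uses vpmulhw with the magic 18725 (0x4925 = ceil(2^17 / 7)), then an
; arithmetic shift by 1 and the usual sign-bit add. AVX512F has no 512-bit
; vpmulhw, so it handles the two 256-bit halves separately; AVX512BW keeps the
; whole vector in a zmm register.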
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm3
; AVX512F-NEXT:    vpsraw $1, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $15, %ymm1, %ymm2
; AVX512F-NEXT:    vpsraw $1, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %res = sdiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <32 x i16> %res
}

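; There is no byte multiply, so <64 x i8> is sign-extended to i16 and multiplied
; by -109 (0xff93), and the high byte of each product is repacked with
; vpackuswb (AVX512F) or vpmovwb (AVX512BW). The arithmetic byte shift by 2 is
; emulated with a logical shift, a mask with 63, and an xor/sub against 32 to
; restore the sign.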
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm4
; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm2[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512F-NEXT:    vpxor %ymm6, %ymm0, %ymm0
; AVX512F-NEXT:    vpsubb %ymm6, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm7
; AVX512F-NEXT:    vpmullw %ymm3, %ymm7, %ymm3
; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpackuswb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpxor %ymm6, %ymm1, %ymm1
; AVX512F-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512BW-NEXT:    vpmovsxbw %ymm3, %zmm3
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxorq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <64 x i8> %res
}

;
; srem by 7
;

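; The remainder tests repeat the corresponding division sequences and then
; subtract 7 * quotient from the input. In the scalar i64 code this needs no
; second multiply: leaq (,%rdx,8) forms 8*q, subq turns it into q - 8*q = -7*q,
; and the final addq of the original element leaves a - 7*q in %rdx.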
define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_rem7_8i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm2
; AVX-NEXT:    vmovq %xmm1, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT:    vpextrq $1, %xmm2, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm3
; AVX-NEXT:    vmovq %xmm2, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpextrq $1, %xmm2, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm3
; AVX-NEXT:    vmovq %xmm2, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm3
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX-NEXT:    retq
  %res = srem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %res
}

define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_rem7_16i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2
; AVX-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT:    vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
; AVX-NEXT:    vpaddd %zmm0, %zmm3, %zmm1
; AVX-NEXT:    vpsrld $31, %zmm1, %zmm2
; AVX-NEXT:    vpsrad $2, %zmm1, %zmm1
; AVX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
; AVX-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; AVX-NEXT:    retq
  %res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i32> %res
}

define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT:    vpsrlw $15, %ymm3, %ymm4
; AVX512F-NEXT:    vpsraw $1, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm1, %ymm2
; AVX512F-NEXT:    vpsrlw $15, %ymm2, %ymm3
; AVX512F-NEXT:    vpsraw $1, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT:    vpsubw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlw $15, %zmm1, %zmm2
; AVX512BW-NEXT:    vpsraw $1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <32 x i16> %res
}

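; For the byte remainder, 7 * quotient again needs a widened multiply: the bytes
; are sign-extended to words and multiplied by 7. AVX512BW narrows the products
; back with vpmovwb; AVX512F has no vpmovwb and truncates via vpmovsxwd followed
; by vpmovdb instead.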
define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_rem7_64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm4
; AVX512F-NEXT:    vpmullw %ymm2, %ymm4, %ymm4
; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm0, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm5
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpand %ymm4, %ymm5, %ymm7
; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512F-NEXT:    vpxor %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpsubb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm7, %ymm3, %ymm7
; AVX512F-NEXT:    vpmovsxbw %xmm7, %ymm8
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT:    vpmullw %ymm3, %ymm8, %ymm8
; AVX512F-NEXT:    vpmovsxwd %ymm8, %zmm8
; AVX512F-NEXT:    vpmovdb %zmm8, %xmm8
; AVX512F-NEXT:    vextracti128 $1, %ymm7, %xmm7
; AVX512F-NEXT:    vpmovsxbw %xmm7, %ymm7
; AVX512F-NEXT:    vpmullw %ymm3, %ymm7, %ymm7
; AVX512F-NEXT:    vpmovsxwd %ymm7, %zmm7
; AVX512F-NEXT:    vpmovdb %zmm7, %xmm7
; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm8, %ymm7
; AVX512F-NEXT:    vpsubb %ymm7, %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm7
; AVX512F-NEXT:    vpmovsxbw %xmm7, %ymm7
; AVX512F-NEXT:    vpmullw %ymm2, %ymm7, %ymm7
; AVX512F-NEXT:    vpsrlw $8, %ymm7, %ymm7
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm8
; AVX512F-NEXT:    vpmullw %ymm2, %ymm8, %ymm2
; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm2[2,3],ymm7[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512F-NEXT:    vpackuswb %ymm8, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm7
; AVX512F-NEXT:    vpand %ymm4, %ymm7, %ymm4
; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpxor %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpsubb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm4
; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm2
; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512BW-NEXT:    vpmovsxbw %ymm3, %zmm3
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm2
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxorq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsubb %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <64 x i8> %res
}