; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

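; The expected code below lowers the divides with the usual signed magic-number
; transform: multiply by a precomputed constant, keep the high half of the
; product (adding the dividend back when the constant is negative, as in the
; i32 and i8 cases), shift right arithmetically, and add the sign bit. For i64
; the constant is 0x4924924924924925 = ceil(2^65/7); e.g. 100 sdiv 7 becomes
; hi = (100 * M) >> 64 = 28, then q = (28 >> 1) + 0 = 14. There is no vector
; multiply-high for i64, so both AVX1 and AVX2 extract each lane, divide it
; with scalar imulq, and rebuild the vector with vmovq/vpunpcklqdq/vinsert*128.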
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    imulq %rcx
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rax
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    imulq %rcx
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

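; i32 also lacks a vector multiply-high, so the high halves are built with
; vpmuldq on the even and odd lanes and blended back together; AVX1 splits the
; 256-bit vector into 128-bit halves, AVX2 works on the whole ymm register.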
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm1
; AVX2-NEXT:    vpsrad $2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

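; i16 has a native multiply-high (vpmulhw), so each half is just a multiply,
; an arithmetic shift right by 1 and a sign-bit add; AVX2 handles the full ymm
; register in one sequence.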
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT:    vpmulhw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $15, %xmm1, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm1
; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

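; i8 is widened: the bytes are sign-extended to i16, multiplied by 65427 (-109)
; and the high bytes repacked. The arithmetic shift right by 2 on bytes is
; emulated with a logical shift, an AND with 63 and an xor/sub 32 sign fixup;
; AVX-512BW widens the whole vector to 512 bits and repacks with vpmovwb.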
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm7
; AVX1-NEXT:    vpmullw %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_32i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2NOBW-NEXT:    vpsrlw $2, %ymm0, %ymm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $7, %ymm0, %ymm0
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpsrlw $2, %ymm0, %ymm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    retq
  %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}

;
; srem by 7
;

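; The remainder lowerings reuse the sdiv-by-7 quotient computation above and
; then form r = x - 7*q. For i64 this is again fully scalarized; the multiply
; by 7 is folded into leaq (,%rdx,8)/subq (i.e. q - 8*q) before the original
; lane value is added back.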
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    addq %rcx, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    addq %rcx, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    addq %rcx, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    imulq %rsi
; AVX1-NEXT:    movq %rdx, %rax
; AVX1-NEXT:    shrq $63, %rax
; AVX1-NEXT:    sarq %rdx
; AVX1-NEXT:    addq %rax, %rdx
; AVX1-NEXT:    leaq (,%rdx,8), %rax
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    addq %rcx, %rdx
; AVX1-NEXT:    vmovq %rdx, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    imulq %rsi
; AVX2-NEXT:    movq %rdx, %rax
; AVX2-NEXT:    shrq $63, %rax
; AVX2-NEXT:    sarq %rdx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    leaq (,%rdx,8), %rax
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    vmovq %rdx, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

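; Same quotient as test_div7_8i32, followed by vpmulld by 7 and vpsubd from the
; original input.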
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpmuldq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm4
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $31, %ymm1, %ymm2
; AVX2-NEXT:    vpsrad $2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
; AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

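; Same quotient as test_div7_16i16, followed by vpmullw by 7 and vpsubw from
; the original input.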
define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT:    vpmulhw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlw $15, %xmm3, %xmm4
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhw %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsrlw $15, %ymm1, %ymm2
; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

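; Same quotient as test_div7_32i8; 7*q is computed in 16-bit lanes, truncated
; back to bytes and subtracted from the original input.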
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $7, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm8, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm9, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpsubb %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm9, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_32i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $2, %ymm1, %ymm2
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vpsubb %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $7, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2NOBW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %ymm0, %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $2, %ymm1, %ymm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT:    vpsubb %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlw $7, %ymm1, %ymm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}