; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

;
; udiv by 7
;

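; The udiv tests below check that division by the constant 7 is lowered to a
; multiply-high by a magic constant followed by a sub/shift/add/shift fix-up;
; only the way the multiply-high is formed differs per element width.
;
; AVX/AVX2 have no vector multiply-high for i64, so test_div7_4i64 extracts
; each lane, runs the scalar magic-number sequence, and re-packs the results.
; A sketch of the arithmetic the checks encode (C-like pseudocode, not part
; of the test input):
;   hi = (unsigned __int128)n * 0x2492492492492493 >> 64;  // mulq
;   q  = (((n - hi) >> 1) + hi) >> 2;                      // q == n / 7
; where 0x2492492492492493 is floor(2^64 / 7) + 1.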
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

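; There is no 32-bit unsigned multiply-high instruction either, so the
; multiply-high by 613566757 (floor(2^32 / 7) + 1) is pieced together from two
; vpmuludq ops on the even and odd lanes (split out with vpshufd) plus a
; blend, followed by the same sub/shift/add/shift fix-up. AVX1 has to work on
; two 128-bit halves; AVX2 stays in a single ymm register using vpblendd.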
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

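; i16 is the easy case: vpmulhuw provides the unsigned multiply-high directly,
; so the magic 9363 (floor(2^16 / 7) + 1) is applied and the usual fix-up
; follows. AVX1 splits the operation into two xmm halves, AVX2 keeps it in one
; ymm register.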
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

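; There is no i8 multiply at all, so the bytes are zero-extended to i16,
; multiplied by the byte magic 37 (floor(2^8 / 7) + 1), and the high bytes are
; recovered with vpsrlw $8 + vpackuswb. The shift-by-1 and shift-by-2 steps of
; the fix-up are done as 16-bit shifts, with masks of 127 and 63 clearing the
; bits that leak across byte boundaries.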
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm5, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmullw %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}

;
; urem by 7
;

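; The urem tests compute the quotient exactly as in the udiv tests above and
; then form the remainder as r = n - 7 * q. In the scalar i64 path the
; multiply by 7 is folded into a shift and subtract; a sketch of the per-lane
; arithmetic (C-like pseudocode, not part of the test input):
;   q = n / 7;               // magic-number sequence as above
;   r = n - ((q << 3) - q);  // leaq (,%rax,8) + subq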
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rax, %rdx
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rax, %rdx
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = urem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

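; i32 remainder: the quotient from the test_div7_8i32 sequence is multiplied
; by a splat of 7 with vpmulld and subtracted from the input with vpsubd.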
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm3
; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrld $2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

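; i16 remainder: quotient via vpmulhuw as above, then a vpmullw by a splat of
; 7 and a vpsubw from the input.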
define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = urem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

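; i8 remainder: the quotient bytes are sign-extended to i16 (vpmovsxbw),
; multiplied by 7 with vpmullw, truncated back to bytes (pand with 255 +
; vpackuswb on AVX1, vpshufb + vpunpcklqdq on AVX2), and subtracted from the
; input with vpsubb.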
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm8, %xmm4, %xmm4
; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm6
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm7
; AVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm3
; AVX1-NEXT:    vpmullw %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmullw %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm3
; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = urem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}