; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

;
; udiv by 7
;

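; Note on the expected lowering: the quotient is computed without a divide
; instruction. Each lane is multiplied by a "magic" reciprocal constant
; (0x2492492492492493 for i64, 613566757 for i32, 9363 for i16, 37 for i8,
; i.e. 2^W/7 rounded up for element width W), the high half of the product is
; kept, and the result is corrected with a subtract/shift/add/shift-by-2
; sequence. The i64 lanes are extracted to scalar registers for the
; 64x64->128 mulq; the narrower element types stay in vector registers and
; use pmuludq/pmulhuw/pmullw for the high multiply.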
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    shrq %rcx
; SSE2-NEXT:    addq %rdx, %rcx
; SSE2-NEXT:    shrq $2, %rcx
; SSE2-NEXT:    movd %rcx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    shrq %rcx
; SSE2-NEXT:    addq %rdx, %rcx
; SSE2-NEXT:    shrq $2, %rcx
; SSE2-NEXT:    movd %rcx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    shrq %rcx
; SSE41-NEXT:    addq %rdx, %rcx
; SSE41-NEXT:    shrq $2, %rcx
; SSE41-NEXT:    movd %rcx, %xmm1
; SSE41-NEXT:    movd %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    shrq %rcx
; SSE41-NEXT:    addq %rdx, %rcx
; SSE41-NEXT:    shrq $2, %rcx
; SSE41-NEXT:    movd %rcx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    shrq %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    shrq $2, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    shrq %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    shrq $2, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = udiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    psubd %xmm2, %xmm0
; SSE2-NEXT:    psrld $1, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    psrld $2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pmuludq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    psubd %xmm1, %xmm0
; SSE41-NEXT:    psrld $1, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    psrld $2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm1, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    psubb %xmm3, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    paddb %xmm3, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    psrlw $1, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; urem by 7
;

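; Note on the expected lowering: the remainder reuses the udiv-by-7 sequence
; above and then subtracts 7 times the quotient from the input. The scalar
; i64 paths form 7*q as leaq (,%rax,8) minus q; the vector paths multiply the
; quotient by a splat of 7 (pmulld/pmullw where available, otherwise a
; widened pmuludq/pmullw expansion) before the final psub.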
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    shrq %rax
; SSE2-NEXT:    addq %rdx, %rax
; SSE2-NEXT:    shrq $2, %rax
; SSE2-NEXT:    leaq (,%rax,8), %rdx
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    movd %rcx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    shrq %rax
; SSE2-NEXT:    addq %rdx, %rax
; SSE2-NEXT:    shrq $2, %rax
; SSE2-NEXT:    leaq (,%rax,8), %rdx
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    movd %rcx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    shrq %rax
; SSE41-NEXT:    addq %rdx, %rax
; SSE41-NEXT:    shrq $2, %rax
; SSE41-NEXT:    leaq (,%rax,8), %rdx
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    movd %rcx, %xmm1
; SSE41-NEXT:    movd %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    shrq %rax
; SSE41-NEXT:    addq %rdx, %rax
; SSE41-NEXT:    shrq $2, %rax
; SSE41-NEXT:    leaq (,%rax,8), %rdx
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    movd %rcx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    addq %rdx, %rax
; AVX-NEXT:    shrq $2, %rax
; AVX-NEXT:    leaq (,%rax,8), %rdx
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    addq %rdx, %rax
; AVX-NEXT:    shrq $2, %rax
; AVX-NEXT:    leaq (,%rax,8), %rdx
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = urem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pmuludq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    psrld $1, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    psrld $2, %xmm2
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psubw %xmm1, %xmm2
; SSE-NEXT:    psrlw $1, %xmm2
; SSE-NEXT:    paddw %xmm1, %xmm2
; SSE-NEXT:    psrlw $2, %xmm2
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm1, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psubb %xmm3, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubb %xmm1, %xmm2
; SSE41-NEXT:    psrlw $1, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm1
; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm3
; SSE41-NEXT:    pmullw %xmm3, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}