; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

;
; sdiv by 7
;

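; Background note (summary of the expansion checked below): signed division by 7
; is lowered to a multiply-high by a "magic" constant. For i64 the multiplier is
; 0x4924924924924925 (i.e. ceil(2^65 / 7)); imulq leaves the high 64 bits of the
; product in %rdx, sarq shifts right by one more bit, and the copied sign bit
; (shrq $63) is added back so negative quotients round toward zero. There is no
; vector i64 multiply-high, so each lane is extracted, divided with scalar code,
; and repacked.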
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rax
; SSE2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movd %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rax
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movd %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movd %rdx, %xmm1
; SSE41-NEXT:    movd %xmm0, %rax
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movd %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

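; For i32 lanes the magic multiplier is 2454267027 (0x92492493, i.e. ceil(2^34 / 7)).
; It does not fit in a signed i32, so the dividend is added back after the
; multiply-high (paddd), then the result is shifted right arithmetically by 2 and
; the sign bit (psrld $31) is added. SSE2 lacks a signed 32-bit multiply-high, so
; pmuludq plus a sign-correction sequence is used; SSE4.1/AVX use pmuldq directly.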
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm2, %xmm3
; SSE41-NEXT:    pmuldq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

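; For i16 lanes the magic multiplier is 18725 (0x4925 = ceil(2^17 / 7)), which fits
; in a signed i16, so a single pmulhw followed by an arithmetic shift right by 1
; plus the sign bit (psrlw $15) is enough.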
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # BB#0:
; SSE-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

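; For i8 lanes the magic multiplier byte is 147 (0x93, i.e. -109 as a signed i8),
; with the dividend added back and a final arithmetic shift right by 2. There is no
; byte multiply or byte shift, so the input is sign-extended to i16, multiplied with
; pmullw, and the high bytes are repacked; the psrlw $2 / pxor / psubb sequence
; emulates the arithmetic byte shift and psrlw $7 + pand extracts the sign bit.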
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm2, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm3, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubb %xmm2, %xmm0
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm2
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; srem by 7
;

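; The remainder tests reuse the division expansions above and then compute
; a - (a / 7) * 7: the quotient is multiplied back by 7 (leaq (,%rdx,8) minus %rdx
; for the scalarized i64 lanes, pmulld / pmullw / widened byte multiplies for the
; vector cases) and subtracted from the original input.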
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    subq %rax, %rcx
; SSE2-NEXT:    movd %rcx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    subq %rax, %rcx
; SSE2-NEXT:    movd %rcx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    subq %rax, %rcx
; SSE41-NEXT:    movd %rcx, %xmm1
; SSE41-NEXT:    movd %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    subq %rax, %rcx
; SSE41-NEXT:    movd %rcx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    subq %rax, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    subq %rax, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrld $31, %xmm2
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm2, %xmm3
; SSE41-NEXT:    pmuldq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrld $31, %xmm2
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    paddd %xmm2, %xmm1
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    psubd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm2, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm3, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT:    pxor %xmm3, %xmm2
; SSE2-NEXT:    psubb %xmm3, %xmm2
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    psubb %xmm3, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm3
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pmullw %xmm3, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm2
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsubb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}