; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64

; If the target does not have a single div/rem operation,
; the -div-rem-pairs pass will decompose the remainder calculation as:
;   X % Y --> X - ((X / Y) * Y)
; But if the target does have a single div/rem operation,
; the opposite transform is likely beneficial.

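; For example, for i32 the decomposed form looks like this (a sketch using
; illustrative value names, mirroring the test bodies below):
;   %div = udiv i32 %x, %y
;   %t1  = mul i32 %div, %y
;   %rem = sub i32 %x, %t1    ; same value as 'urem i32 %x, %y'
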
define i8 @scalar_i8(i8 %x, i8 %y, i8* %divdst) nounwind {
; X86-LABEL: scalar_i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    movzbl %cl, %eax
; X86-NEXT:    divb %ch
; X86-NEXT:    movb %al, (%edx)
; X86-NEXT:    mulb %ch
; X86-NEXT:    subb %al, %cl
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl %dil, %ecx
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    divb %sil
; X64-NEXT:    movb %al, (%rdx)
; X64-NEXT:    mulb %sil
; X64-NEXT:    subb %al, %cl
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
  %div = udiv i8 %x, %y
  store i8 %div, i8* %divdst, align 4
  %t1 = mul i8 %div, %y
  %t2 = sub i8 %x, %t1
  ret i8 %t2
}

define i16 @scalar_i16(i16 %x, i16 %y, i16* %divdst) nounwind {
; X86-LABEL: scalar_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movw %ax, (%edi)
; X86-NEXT:    imull %eax, %esi
; X86-NEXT:    subl %esi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %si
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movw %ax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i16 %x, %y
  store i16 %div, i16* %divdst, align 4
  %t1 = mul i16 %div, %y
  %t2 = sub i16 %x, %t1
  ret i16 %t2
}

define i32 @scalar_i32(i32 %x, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
; X86-LABEL: scalar_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ecx, 4(%edx)
; X86-NEXT:    movl %eax, (%edx)
; X86-NEXT:    imull %eax, %ebx
; X86-NEXT:    mull %ebp
; X86-NEXT:    addl %ebx, %edx
; X86-NEXT:    imull %ebp, %ecx
; X86-NEXT:    addl %edx, %ecx
; X86-NEXT:    subl %eax, %esi
; X86-NEXT:    sbbl %ecx, %edi
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    movl %edi, %edx
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rsi
; X64-NEXT:    movq %rax, (%rcx)
; X64-NEXT:    imulq %rsi, %rax
; X64-NEXT:    subq %rax, %rdi
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %div = udiv i64 %x, %y
  store i64 %div, i64* %divdst, align 4
  %t1 = mul i64 %div, %y
  %t2 = sub i64 %x, %t1
  ret i64 %t2
}

define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst) nounwind {
; X86-LABEL: vector_i128_i8:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm6
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %ebx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    movzbl (%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT:    movd %edx, %xmm4
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT:    movd %esi, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; X86-NEXT:    movd %edi, %xmm5
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; X86-NEXT:    movd %ebx, %xmm4
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movd %ecx, %xmm6
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; X86-NEXT:    movdqa %xmm5, %xmm2
; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT:    movdqa %xmm2, (%ecx)
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    movdqa %xmm1, %xmm2
; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-NEXT:    pmullw %xmm3, %xmm2
; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-NEXT:    pand %xmm3, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    pmullw %xmm5, %xmm1
; X86-NEXT:    pand %xmm3, %xmm1
; X86-NEXT:    packuswb %xmm2, %xmm1
; X86-NEXT:    psubb %xmm1, %xmm0
; X86-NEXT:    leal -12(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i8:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    pushq %r15
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r8d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r9d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r10d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r11d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r14d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r15d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r12d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r13d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edi
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %esi
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebp
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movd %r8d, %xmm3
; X64-NEXT:    movd %r9d, %xmm4
; X64-NEXT:    movd %r10d, %xmm5
; X64-NEXT:    movd %r11d, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-NEXT:    movd %r14d, %xmm2
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X64-NEXT:    movd %r15d, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    movd %r12d, %xmm3
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT:    movd %r13d, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-NEXT:    movd %edi, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT:    movd %esi, %xmm2
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; X64-NEXT:    movd %ebx, %xmm5
; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
; X64-NEXT:    movd %ebp, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X64-NEXT:    movd %edx, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-NEXT:    movzbl %cl, %ecx
; X64-NEXT:    movd %ecx, %xmm4
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; X64-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; X64-NEXT:    movdqa %xmm6, %xmm2
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT:    movdqa %xmm2, (%rax)
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT:    pmullw %xmm3, %xmm2
; X64-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X64-NEXT:    pand %xmm3, %xmm2
; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    pmullw %xmm6, %xmm1
; X64-NEXT:    pand %xmm3, %xmm1
; X64-NEXT:    packuswb %xmm2, %xmm1
; X64-NEXT:    psubb %xmm1, %xmm0
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %r15
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %div = udiv <16 x i8> %x, %y
  store <16 x i8> %div, <16 x i8>* %divdst, align 16
  %t1 = mul <16 x i8> %div, %y
  %t2 = sub <16 x i8> %x, %t1
  ret <16 x i8> %t2
}

define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst) nounwind {
; X86-LABEL: vector_i128_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pextrw $7, %xmm0, %eax
; X86-NEXT:    pextrw $7, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pextrw $6, %xmm0, %eax
; X86-NEXT:    pextrw $6, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT:    pextrw $5, %xmm0, %eax
; X86-NEXT:    pextrw $5, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    pextrw $4, %xmm0, %eax
; X86-NEXT:    pextrw $4, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT:    pextrw $3, %xmm0, %eax
; X86-NEXT:    pextrw $3, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    pextrw $2, %xmm0, %eax
; X86-NEXT:    pextrw $2, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X86-NEXT:    pextrw $1, %xmm0, %eax
; X86-NEXT:    pextrw $1, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X86-NEXT:    movdqa %xmm5, (%ecx)
; X86-NEXT:    pmullw %xmm1, %xmm5
; X86-NEXT:    psubw %xmm5, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i16:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $7, %xmm0, %eax
; X64-NEXT:    pextrw $7, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pextrw $6, %xmm0, %eax
; X64-NEXT:    pextrw $6, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT:    pextrw $5, %xmm0, %eax
; X64-NEXT:    pextrw $5, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    pextrw $4, %xmm0, %eax
; X64-NEXT:    pextrw $4, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT:    pextrw $3, %xmm0, %eax
; X64-NEXT:    pextrw $3, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    pextrw $2, %xmm0, %eax
; X64-NEXT:    pextrw $2, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT:    pextrw $1, %xmm0, %eax
; X64-NEXT:    pextrw $1, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm5
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X64-NEXT:    movdqa %xmm5, (%rdi)
; X64-NEXT:    pmullw %xmm1, %xmm5
; X64-NEXT:    psubw %xmm5, %xmm0
; X64-NEXT:    retq
  %div = udiv <8 x i16> %x, %y
  store <8 x i16> %div, <8 x i16>* %divdst, align 16
  %t1 = mul <8 x i16> %div, %y
  %t2 = sub <8 x i16> %x, %t1
  ret <8 x i16> %t2
}

define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst) nounwind {
; X86-LABEL: vector_i128_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm2, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm3, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm3, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X86-NEXT:    movd %xmm4, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm4, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT:    movdqa %xmm2, (%ecx)
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X86-NEXT:    pmuludq %xmm1, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT:    pmuludq %xmm3, %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT:    psubd %xmm2, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i32:
; X64:       # %bb.0:
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT:    movd %xmm2, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X64-NEXT:    movd %xmm2, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movd %xmm3, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movd %xmm3, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X64-NEXT:    movd %xmm4, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X64-NEXT:    movd %xmm4, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT:    pmuludq %xmm3, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    psubd %xmm2, %xmm0
; X64-NEXT:    retq
  %div = udiv <4 x i32> %x, %y
  store <4 x i32> %div, <4 x i32>* %divdst, align 16
  %t1 = mul <4 x i32> %div, %y
  %t2 = sub <4 x i32> %x, %t1
  ret <4 x i32> %t2
}

define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst) nounwind {
; X86-LABEL: vector_i128_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $72, %esp
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    movd %edx, %xmm0
; X86-NEXT:    movd %eax, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    movd %edx, %xmm0
; X86-NEXT:    movd %eax, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X86-NEXT:    movdqa %xmm1, (%esi)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
; X86-NEXT:    movdqa %xmm3, %xmm0
; X86-NEXT:    psrlq $32, %xmm0
; X86-NEXT:    pmuludq %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm1, %xmm2
; X86-NEXT:    psrlq $32, %xmm2
; X86-NEXT:    pmuludq %xmm3, %xmm2
; X86-NEXT:    paddq %xmm0, %xmm2
; X86-NEXT:    psllq $32, %xmm2
; X86-NEXT:    pmuludq %xmm3, %xmm1
; X86-NEXT:    paddq %xmm2, %xmm1
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    psubq %xmm1, %xmm0
; X86-NEXT:    addl $72, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %xmm0, %rax
; X64-NEXT:    movq %xmm1, %rcx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rcx
; X64-NEXT:    movq %rax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rcx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rcx
; X64-NEXT:    movq %rax, %xmm3
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    movdqa %xmm1, %xmm3
; X64-NEXT:    psrlq $32, %xmm3
; X64-NEXT:    pmuludq %xmm2, %xmm3
; X64-NEXT:    movdqa %xmm2, %xmm4
; X64-NEXT:    psrlq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm4
; X64-NEXT:    paddq %xmm3, %xmm4
; X64-NEXT:    psllq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    paddq %xmm4, %xmm2
; X64-NEXT:    psubq %xmm2, %xmm0
; X64-NEXT:    retq
  %div = udiv <2 x i64> %x, %y
  store <2 x i64> %div, <2 x i64>* %divdst, align 16
  %t1 = mul <2 x i64> %div, %y
  %t2 = sub <2 x i64> %x, %t1
  ret <2 x i64> %t2
}

; Special tests.

define i32 @scalar_i32_commutative(i32 %x, i32* %ysrc, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32_commutative:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %eax, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32_commutative:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl (%rsi), %esi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %y = load i32, i32* %ysrc, align 4
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %y, %div ; commutative
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; We do not care about extra uses.
define i32 @extrause(i32 %x, i32 %y, i32* %divdst, i32* %t1dst) nounwind {
; X86-LABEL: extrause:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ebx
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    imull %ebx, %eax
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: extrause:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%r8)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  store i32 %t1, i32* %t1dst, align 4
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; 'rem' should appear next to 'div'.
define i32 @multiple_bb(i32 %x, i32 %y, i32* %divdst, i1 zeroext %store_urem, i32* %uremdst) nounwind {
; X86-LABEL: multiple_bb:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    testb %bl, %bl
; X86-NEXT:    je .LBB10_2
; X86-NEXT:  # %bb.1: # %do_urem
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %eax, %edi
; X86-NEXT:    imull %esi, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:  .LBB10_2: # %end
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: multiple_bb:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%r9)
; X64-NEXT:    testb %cl, %cl
; X64-NEXT:    je .LBB10_2
; X64-NEXT:  # %bb.1: # %do_urem
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    imull %esi, %ecx
; X64-NEXT:    subl %ecx, %edi
; X64-NEXT:    movl %edi, (%r8)
; X64-NEXT:  .LBB10_2: # %end
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  br i1 %store_urem, label %do_urem, label %end
do_urem:
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  store i32 %t2, i32* %uremdst, align 4
  br label %end
end:
  ret i32 %div
}

define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: negative_different_x:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: negative_different_x:
; X64:       # %bb.0:
; X64-NEXT:    movl %edx, %r8d
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %r8d
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %r8d, %eax
; X64-NEXT:    subl %eax, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x0, %y ; not %x1
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1 ; not %x0
  ret i32 %t2
}