; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O3 -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s -check-prefix=X64
; RUN: llc < %s -O3 -mtriple=i686-unknown-unknown   -mcpu=core2 | FileCheck %s -check-prefix=X32

; @simple is the most basic chain of address induction variables. Chaining
; saves at least one register and avoids complex addressing and setup
; code.
;
; X64: %x * 4
; no other address computation in the preheader
; no complex address modes
;
; X32: no expensive address computation in the preheader
; no complex address modes

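; A rough C equivalent of the loop below (illustrative sketch only; the C
; names are invented, not taken from the original source):
;
;   int simple(int *a, int *b, int x) {
;     int *p = a;
;     int s = 0;
;     do {
;       s += p[0];       /* %v  */
;       s += p[x];       /* %v1 */
;       s += p[2 * x];   /* %v2 */
;       s += p[3 * x];   /* %v3 */
;       p += 4 * x;      /* %iv4 */
;     } while (p != b);
;     return s;
;   }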
define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
; X64-LABEL: simple:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movslq %edx, %rcx
; X64-NEXT:    shlq $2, %rcx
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB0_1: # %loop
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    addl (%rdi), %eax
; X64-NEXT:    leaq (%rdi,%rcx), %r8
; X64-NEXT:    addl (%rdi,%rcx), %eax
; X64-NEXT:    leaq (%r8,%rcx), %rdx
; X64-NEXT:    addl (%rcx,%r8), %eax
; X64-NEXT:    addl (%rcx,%rdx), %eax
; X64-NEXT:    addq %rcx, %rdx
; X64-NEXT:    addq %rcx, %rdx
; X64-NEXT:    movq %rdx, %rdi
; X64-NEXT:    cmpq %rsi, %rdx
; X64-NEXT:    jne .LBB0_1
; X64-NEXT:  # %bb.2: # %exit
; X64-NEXT:    retq
;
; X32-LABEL: simple:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebx
; X32-NEXT:    pushl %edi
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    shll $2, %edx
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB0_1: # %loop
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    addl (%esi), %eax
; X32-NEXT:    leal (%esi,%edx), %edi
; X32-NEXT:    addl (%esi,%edx), %eax
; X32-NEXT:    leal (%edi,%edx), %ebx
; X32-NEXT:    addl (%edx,%edi), %eax
; X32-NEXT:    addl (%edx,%ebx), %eax
; X32-NEXT:    addl %edx, %ebx
; X32-NEXT:    addl %edx, %ebx
; X32-NEXT:    movl %ebx, %esi
; X32-NEXT:    cmpl %ecx, %ebx
; X32-NEXT:    jne .LBB0_1
; X32-NEXT:  # %bb.2: # %exit
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %edi
; X32-NEXT:    popl %ebx
; X32-NEXT:    retl
entry:
  br label %loop
loop:
  %iv = phi i32* [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  %v = load i32, i32* %iv
  %iv1 = getelementptr inbounds i32, i32* %iv, i32 %x
  %v1 = load i32, i32* %iv1
  %iv2 = getelementptr inbounds i32, i32* %iv1, i32 %x
  %v2 = load i32, i32* %iv2
  %iv3 = getelementptr inbounds i32, i32* %iv2, i32 %x
  %v3 = load i32, i32* %iv3
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  %iv4 = getelementptr inbounds i32, i32* %iv3, i32 %x
  %cmp = icmp eq i32* %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @user is not currently chained because the IV is live across memory ops.
;
; Without chaining we expect:
; expensive address computation in the preheader
; complex address modes
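; A rough C equivalent of the loop below (illustrative sketch only; the C
; names are invented). The store through the induction pointer keeps the IV
; live across memory operations, which is why no chain is formed:
;
;   int user(int *a, int *b, int x) {
;     int *p = a;
;     int s = 0;
;     do {
;       s += p[0] + p[x] + p[2 * x] + p[3 * x];
;       *p = s;
;       p += 4 * x;
;     } while (p != b);
;     return s;
;   }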
define i32 @user(i32* %a, i32* %b, i32 %x) nounwind {
; X64-LABEL: user:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movslq %edx, %rcx
; X64-NEXT:    movq %rcx, %rdx
; X64-NEXT:    shlq $4, %rdx
; X64-NEXT:    leaq (,%rcx,4), %rax
; X64-NEXT:    leaq (%rax,%rax,2), %r8
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB1_1: # %loop
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    addl (%rdi), %eax
; X64-NEXT:    addl (%rdi,%rcx,4), %eax
; X64-NEXT:    addl (%rdi,%rcx,8), %eax
; X64-NEXT:    addl (%rdi,%r8), %eax
; X64-NEXT:    movl %eax, (%rdi)
; X64-NEXT:    addq %rdx, %rdi
; X64-NEXT:    cmpq %rdi, %rsi
; X64-NEXT:    jne .LBB1_1
; X64-NEXT:  # %bb.2: # %exit
; X64-NEXT:    retq
;
; X32-LABEL: user:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebx
; X32-NEXT:    pushl %edi
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %ecx, %edi
; X32-NEXT:    shll $4, %edi
; X32-NEXT:    leal (,%ecx,4), %eax
; X32-NEXT:    leal (%eax,%eax,2), %ebx
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB1_1: # %loop
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    addl (%esi), %eax
; X32-NEXT:    addl (%esi,%ecx,4), %eax
; X32-NEXT:    addl (%esi,%ecx,8), %eax
; X32-NEXT:    addl (%esi,%ebx), %eax
; X32-NEXT:    movl %eax, (%esi)
; X32-NEXT:    addl %edi, %esi
; X32-NEXT:    cmpl %esi, %edx
; X32-NEXT:    jne .LBB1_1
; X32-NEXT:  # %bb.2: # %exit
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %edi
; X32-NEXT:    popl %ebx
; X32-NEXT:    retl
entry:
  br label %loop
loop:
  %iv = phi i32* [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  %v = load i32, i32* %iv
  %iv1 = getelementptr inbounds i32, i32* %iv, i32 %x
  %v1 = load i32, i32* %iv1
  %iv2 = getelementptr inbounds i32, i32* %iv1, i32 %x
  %v2 = load i32, i32* %iv2
  %iv3 = getelementptr inbounds i32, i32* %iv2, i32 %x
  %v3 = load i32, i32* %iv3
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  %iv4 = getelementptr inbounds i32, i32* %iv3, i32 %x
  store i32 %s4, i32* %iv
  %cmp = icmp eq i32* %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @extrastride is a slightly more interesting case of a single
; complete chain with multiple strides. The test case IR is what LSR
; used to do, and exactly what we don't want to do. LSR's new IV
; chaining feature should now undo the damage.
;
; We currently don't handle this on X64 because the sexts cause
; strange increment expressions like this:
; IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
;
; For X32, no spills in the preheader, no complex address modes, no reloads.

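; A rough C equivalent of the loop below (illustrative sketch only; the C
; names mirror the IR arguments). 'main' advances by main_stride*5 + x bytes
; per iteration and 'res' advances by y elements:
;
;   void extrastride(char *main, int main_stride, int *res,
;                    int x, int y, int z) {
;     for (int i = 0; i != z; ++i) {
;       int sum = *(int *)main
;               + *(int *)(main + main_stride)
;               + *(int *)(main + 2 * main_stride)
;               + *(int *)(main + 3 * main_stride)
;               + *(int *)(main + 4 * main_stride);
;       *res = sum;
;       main += 5 * main_stride + x;
;       res += y;
;     }
;   }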
define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
; X64-LABEL: extrastride:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rbp
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %rbx
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    # kill: def $esi killed $esi def $rsi
; X64-NEXT:    testl %r9d, %r9d
; X64-NEXT:    je .LBB2_3
; X64-NEXT:  # %bb.1: # %for.body.lr.ph
; X64-NEXT:    leal (%rsi,%rsi), %r14d
; X64-NEXT:    leal (%rsi,%rsi,2), %ebx
; X64-NEXT:    addl %esi, %ecx
; X64-NEXT:    leal (,%rsi,4), %eax
; X64-NEXT:    leal (%rcx,%rsi,4), %ebp
; X64-NEXT:    movslq %eax, %r10
; X64-NEXT:    movslq %ebx, %r11
; X64-NEXT:    movslq %r14d, %rbx
; X64-NEXT:    movslq %esi, %rsi
; X64-NEXT:    movslq %r8d, %rcx
; X64-NEXT:    shlq $2, %rcx
; X64-NEXT:    movslq %ebp, %rax
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB2_2: # %for.body
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    movl (%rdi,%rsi), %ebp
; X64-NEXT:    addl (%rdi), %ebp
; X64-NEXT:    addl (%rdi,%rbx), %ebp
; X64-NEXT:    addl (%rdi,%r11), %ebp
; X64-NEXT:    addl (%rdi,%r10), %ebp
; X64-NEXT:    movl %ebp, (%rdx)
; X64-NEXT:    addq %rax, %rdi
; X64-NEXT:    addq %rcx, %rdx
; X64-NEXT:    decl %r9d
; X64-NEXT:    jne .LBB2_2
; X64-NEXT:  .LBB2_3: # %for.end
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
;
; X32-LABEL: extrastride:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebp
; X32-NEXT:    pushl %ebx
; X32-NEXT:    pushl %edi
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    testl %eax, %eax
; X32-NEXT:    je .LBB2_3
; X32-NEXT:  # %bb.1: # %for.body.lr.ph
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X32-NEXT:    addl %esi, %edi
; X32-NEXT:    shll $2, %ecx
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB2_2: # %for.body
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    movl (%ebx,%esi), %ebp
; X32-NEXT:    addl (%ebx), %ebp
; X32-NEXT:    addl %esi, %ebx
; X32-NEXT:    addl (%esi,%ebx), %ebp
; X32-NEXT:    addl %esi, %ebx
; X32-NEXT:    addl (%esi,%ebx), %ebp
; X32-NEXT:    addl %esi, %ebx
; X32-NEXT:    addl (%esi,%ebx), %ebp
; X32-NEXT:    movl %ebp, (%edx)
; X32-NEXT:    addl %esi, %ebx
; X32-NEXT:    addl %edi, %ebx
; X32-NEXT:    addl %ecx, %edx
; X32-NEXT:    decl %eax
; X32-NEXT:    jne .LBB2_2
; X32-NEXT:  .LBB2_3: # %for.end
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %edi
; X32-NEXT:    popl %ebx
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
entry:
  %cmp8 = icmp eq i32 %z, 0
  br i1 %cmp8, label %for.end, label %for.body.lr.ph

for.body.lr.ph:                                   ; preds = %entry
  %add.ptr.sum = shl i32 %main_stride, 1 ; s*2
  %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride ; s*3
  %add.ptr2.sum = add i32 %x, %main_stride ; s + x
  %add.ptr4.sum = shl i32 %main_stride, 2 ; s*4
  %add.ptr3.sum = add i32 %add.ptr2.sum, %add.ptr4.sum ; total IV stride = s*5+x
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %main.addr.011 = phi i8* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %res.addr.09 = phi i32* [ %res, %for.body.lr.ph ], [ %add.ptr7, %for.body ]
  %0 = bitcast i8* %main.addr.011 to i32*
  %1 = load i32, i32* %0, align 4
  %add.ptr = getelementptr inbounds i8, i8* %main.addr.011, i32 %main_stride
  %2 = bitcast i8* %add.ptr to i32*
  %3 = load i32, i32* %2, align 4
  %add.ptr1 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr.sum
  %4 = bitcast i8* %add.ptr1 to i32*
  %5 = load i32, i32* %4, align 4
  %add.ptr2 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr1.sum
  %6 = bitcast i8* %add.ptr2 to i32*
  %7 = load i32, i32* %6, align 4
  %add.ptr3 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr4.sum
  %8 = bitcast i8* %add.ptr3 to i32*
  %9 = load i32, i32* %8, align 4
  %add = add i32 %3, %1
  %add4 = add i32 %add, %5
  %add5 = add i32 %add4, %7
  %add6 = add i32 %add5, %9
  store i32 %add6, i32* %res.addr.09, align 4
  %add.ptr6 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr3.sum
  %add.ptr7 = getelementptr inbounds i32, i32* %res.addr.09, i32 %y
  %inc = add i32 %i.010, 1
  %cmp = icmp eq i32 %inc, %z
  br i1 %cmp, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; @foldedidx is an unrolled variant of this loop:
;  for (unsigned long i = 0; i < len; i += s) {
;    c[i] = a[i] + b[i];
;  }
; where 's' can be folded into the addressing mode.
; Consequently, we should *not* form any chains.

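; Concretely, the IR below corresponds to something like this unrolled-by-4
; form (illustrative C only; the bound of 400 and the unit stride come from
; the IR):
;
;   void foldedidx(unsigned char *a, unsigned char *b, unsigned char *c) {
;     for (int i = 0; i != 400; i += 4) {
;       c[i]     = a[i]     + b[i];
;       c[i + 1] = a[i + 1] + b[i + 1];
;       c[i + 2] = a[i + 2] + b[i + 2];
;       c[i + 3] = a[i + 3] + b[i + 3];
;     }
;   }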
define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
; X64-LABEL: foldedidx:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl $3, %eax
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB3_1: # %for.body
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    movzbl -3(%rdi,%rax), %r8d
; X64-NEXT:    movzbl -3(%rsi,%rax), %ecx
; X64-NEXT:    addl %r8d, %ecx
; X64-NEXT:    movb %cl, -3(%rdx,%rax)
; X64-NEXT:    movzbl -2(%rdi,%rax), %r8d
; X64-NEXT:    movzbl -2(%rsi,%rax), %ecx
; X64-NEXT:    addl %r8d, %ecx
; X64-NEXT:    movb %cl, -2(%rdx,%rax)
; X64-NEXT:    movzbl -1(%rdi,%rax), %r8d
; X64-NEXT:    movzbl -1(%rsi,%rax), %ecx
; X64-NEXT:    addl %r8d, %ecx
; X64-NEXT:    movb %cl, -1(%rdx,%rax)
; X64-NEXT:    movzbl (%rdi,%rax), %r8d
; X64-NEXT:    movzbl (%rsi,%rax), %ecx
; X64-NEXT:    addl %r8d, %ecx
; X64-NEXT:    movb %cl, (%rdx,%rax)
; X64-NEXT:    addq $4, %rax
; X64-NEXT:    cmpl $403, %eax # imm = 0x193
; X64-NEXT:    jne .LBB3_1
; X64-NEXT:  # %bb.2: # %for.end
; X64-NEXT:    retq
;
; X32-LABEL: foldedidx:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebx
; X32-NEXT:    pushl %edi
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl $3, %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB3_1: # %for.body
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    movzbl -3(%esi,%eax), %edi
; X32-NEXT:    movzbl -3(%edx,%eax), %ebx
; X32-NEXT:    addl %edi, %ebx
; X32-NEXT:    movb %bl, -3(%ecx,%eax)
; X32-NEXT:    movzbl -2(%esi,%eax), %edi
; X32-NEXT:    movzbl -2(%edx,%eax), %ebx
; X32-NEXT:    addl %edi, %ebx
; X32-NEXT:    movb %bl, -2(%ecx,%eax)
; X32-NEXT:    movzbl -1(%esi,%eax), %edi
; X32-NEXT:    movzbl -1(%edx,%eax), %ebx
; X32-NEXT:    addl %edi, %ebx
; X32-NEXT:    movb %bl, -1(%ecx,%eax)
; X32-NEXT:    movzbl (%esi,%eax), %edi
; X32-NEXT:    movzbl (%edx,%eax), %ebx
; X32-NEXT:    addl %edi, %ebx
; X32-NEXT:    movb %bl, (%ecx,%eax)
; X32-NEXT:    addl $4, %eax
; X32-NEXT:    cmpl $403, %eax # imm = 0x193
; X32-NEXT:    jne .LBB3_1
; X32-NEXT:  # %bb.2: # %for.end
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %edi
; X32-NEXT:    popl %ebx
; X32-NEXT:    retl
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.07 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.07
  %0 = load i8, i8* %arrayidx, align 1
  %conv5 = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.07
  %1 = load i8, i8* %arrayidx1, align 1
  %conv26 = zext i8 %1 to i32
  %add = add nsw i32 %conv26, %conv5
  %conv3 = trunc i32 %add to i8
  %arrayidx4 = getelementptr inbounds i8, i8* %c, i32 %i.07
  store i8 %conv3, i8* %arrayidx4, align 1
  %inc1 = or i32 %i.07, 1
  %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc1
  %2 = load i8, i8* %arrayidx.1, align 1
  %conv5.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc1
  %3 = load i8, i8* %arrayidx1.1, align 1
  %conv26.1 = zext i8 %3 to i32
  %add.1 = add nsw i32 %conv26.1, %conv5.1
  %conv3.1 = trunc i32 %add.1 to i8
  %arrayidx4.1 = getelementptr inbounds i8, i8* %c, i32 %inc1
  store i8 %conv3.1, i8* %arrayidx4.1, align 1
  %inc.12 = or i32 %i.07, 2
  %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.12
  %4 = load i8, i8* %arrayidx.2, align 1
  %conv5.2 = zext i8 %4 to i32
  %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.12
  %5 = load i8, i8* %arrayidx1.2, align 1
  %conv26.2 = zext i8 %5 to i32
  %add.2 = add nsw i32 %conv26.2, %conv5.2
  %conv3.2 = trunc i32 %add.2 to i8
  %arrayidx4.2 = getelementptr inbounds i8, i8* %c, i32 %inc.12
  store i8 %conv3.2, i8* %arrayidx4.2, align 1
  %inc.23 = or i32 %i.07, 3
  %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.23
  %6 = load i8, i8* %arrayidx.3, align 1
  %conv5.3 = zext i8 %6 to i32
  %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.23
  %7 = load i8, i8* %arrayidx1.3, align 1
  %conv26.3 = zext i8 %7 to i32
  %add.3 = add nsw i32 %conv26.3, %conv5.3
  %conv3.3 = trunc i32 %add.3 to i8
  %arrayidx4.3 = getelementptr inbounds i8, i8* %c, i32 %inc.23
  store i8 %conv3.3, i8* %arrayidx4.3, align 1
  %inc.3 = add nsw i32 %i.07, 4
  %exitcond.3 = icmp eq i32 %inc.3, 400
  br i1 %exitcond.3, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; @multioper tests instructions with multiple IV user operands. We
; should be able to chain them independently of each other.

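; A rough C equivalent of the loop below (illustrative sketch only; the C
; names are invented). Each store uses both the pointer IV and the integer IV:
;
;   void multioper(int *a, int n) {
;     int *p = a;
;     int i = 0;
;     do {
;       p[0] = i;
;       p[1] = i + 1;
;       p[2] = i + 2;
;       p[3] = i + 3;
;       p += 4;
;       i += 4;
;     } while (i < n);
;   }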
define void @multioper(i32* %a, i32 %n) nounwind {
; X64-LABEL: multioper:
; X64:       # %bb.0: # %entry
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB4_1: # %for.body
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    movl %eax, (%rdi,%rax,4)
; X64-NEXT:    leal 1(%rax), %ecx
; X64-NEXT:    movl %ecx, 4(%rdi,%rax,4)
; X64-NEXT:    leal 2(%rax), %ecx
; X64-NEXT:    movl %ecx, 8(%rdi,%rax,4)
; X64-NEXT:    leal 3(%rax), %ecx
; X64-NEXT:    movl %ecx, 12(%rdi,%rax,4)
; X64-NEXT:    addq $4, %rax
; X64-NEXT:    cmpl %esi, %eax
; X64-NEXT:    jl .LBB4_1
; X64-NEXT:  # %bb.2: # %exit
; X64-NEXT:    retq
;
; X32-LABEL: multioper:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %esi
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB4_1: # %for.body
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    movl %eax, (%edx,%eax,4)
; X32-NEXT:    leal 1(%eax), %esi
; X32-NEXT:    movl %esi, 4(%edx,%eax,4)
; X32-NEXT:    leal 2(%eax), %esi
; X32-NEXT:    movl %esi, 8(%edx,%eax,4)
; X32-NEXT:    leal 3(%eax), %esi
; X32-NEXT:    movl %esi, 12(%edx,%eax,4)
; X32-NEXT:    addl $4, %eax
; X32-NEXT:    cmpl %ecx, %eax
; X32-NEXT:    jl .LBB4_1
; X32-NEXT:  # %bb.2: # %exit
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
entry:
  br label %for.body

for.body:
  %p = phi i32* [ %p.next, %for.body ], [ %a, %entry ]
  %i = phi i32 [ %inc4, %for.body ], [ 0, %entry ]
  store i32 %i, i32* %p, align 4
  %inc1 = or i32 %i, 1
  %add.ptr.i1 = getelementptr inbounds i32, i32* %p, i32 1
  store i32 %inc1, i32* %add.ptr.i1, align 4
  %inc2 = add nsw i32 %i, 2
  %add.ptr.i2 = getelementptr inbounds i32, i32* %p, i32 2
  store i32 %inc2, i32* %add.ptr.i2, align 4
  %inc3 = add nsw i32 %i, 3
  %add.ptr.i3 = getelementptr inbounds i32, i32* %p, i32 3
  store i32 %inc3, i32* %add.ptr.i3, align 4
  %p.next = getelementptr inbounds i32, i32* %p, i32 4
  %inc4 = add nsw i32 %i, 4
  %cmp = icmp slt i32 %inc4, %n
  br i1 %cmp, label %for.body, label %exit

exit:
  ret void
}

; @testCmpZero has an ICmpZero LSR use that should not be hidden from
; LSR. Profitable chains should have more than one nonzero increment
; anyway.

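; A rough C equivalent of the loop below (illustrative sketch only; the C
; names are invented). Note that, as in the IR, the destination is based on
; 'src' and the source on 'dst'. The loop stores the low byte of each 32-bit
; word it reads:
;
;   void testCmpZero(char *src, char *dst, int srcidx, int dstidx, int len) {
;     char *dest = src + srcidx;
;     int *source = (int *)(dst + dstidx);
;     do {
;       *dest++ = (char)*source++;
;     } while (dest != src + srcidx + len);
;   }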
define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
; X64-LABEL: testCmpZero:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movslq %edx, %rdx
; X64-NEXT:    addq %rdx, %rdi
; X64-NEXT:    movslq %ecx, %r9
; X64-NEXT:    addq %rsi, %r9
; X64-NEXT:    addl %edx, %r8d
; X64-NEXT:    movslq %r8d, %rcx
; X64-NEXT:    subq %rdx, %rcx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB5_1: # %for.body82.us
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    movzbl (%r9,%rdx,4), %eax
; X64-NEXT:    movb %al, (%rdi,%rdx)
; X64-NEXT:    incq %rdx
; X64-NEXT:    cmpq %rdx, %rcx
; X64-NEXT:    jne .LBB5_1
; X64-NEXT:  # %bb.2: # %return
; X64-NEXT:    retq
;
; X32-LABEL: testCmpZero:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebx
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    xorl %esi, %esi
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB5_1: # %for.body82.us
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    movzbl (%edx,%esi,4), %ebx
; X32-NEXT:    movb %bl, (%ecx,%esi)
; X32-NEXT:    incl %esi
; X32-NEXT:    cmpl %esi, %eax
; X32-NEXT:    jne .LBB5_1
; X32-NEXT:  # %bb.2: # %return
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebx
; X32-NEXT:    retl
entry:
  %dest0 = getelementptr inbounds i8, i8* %src, i32 %srcidx
  %source0 = getelementptr inbounds i8, i8* %dst, i32 %dstidx
  %add.ptr79.us.sum = add i32 %srcidx, %len
  %lftr.limit = getelementptr i8, i8* %src, i32 %add.ptr79.us.sum
  br label %for.body82.us

for.body82.us:
  %dest = phi i8* [ %dest0, %entry ], [ %incdec.ptr91.us, %for.body82.us ]
  %source = phi i8* [ %source0, %entry ], [ %add.ptr83.us, %for.body82.us ]
  %0 = bitcast i8* %source to i32*
  %1 = load i32, i32* %0, align 4
  %trunc = trunc i32 %1 to i8
  %add.ptr83.us = getelementptr inbounds i8, i8* %source, i32 4
  %incdec.ptr91.us = getelementptr inbounds i8, i8* %dest, i32 1
  store i8 %trunc, i8* %dest, align 1
  %exitcond = icmp eq i8* %incdec.ptr91.us, %lftr.limit
  br i1 %exitcond, label %return, label %for.body82.us

return:
  ret void
}