; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
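;
; The first RUN line exercises the SLP vectorizer with its default settings
; (CHECK prefix). The second additionally passes -slp-vectorize-hor and
; -slp-vectorize-hor-store, which are intended to also vectorize horizontal
; reductions whose result feeds a store rather than only ones feeding a
; return or phi (STORE prefix); @store_red_double and @store_red below show
; where the two configurations diverge.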

; #include <stdint.h>
;
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4  ] +
;            7*A[i*4+1] +
;            7*A[i*4+2] +
;            7*A[i*4+3];
;   }
;   return sum;
; }
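;
; Both prefixes expect the four 7*A[i*4+k] terms to be combined into a single
; <4 x float> load, a vector fmul by a splat of 7.0, and an
; @llvm.vector.reduce.fadd.v4f32 call whose result is added into the running
; sum.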

define i32 @add_red(float* %A, i32 %n) {
; CHECK-LABEL: @add_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[ADD28:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD28]]
; CHECK-NEXT:    [[ADD829:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD829]]
; CHECK-NEXT:    [[ADD1330:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @add_red(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; STORE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[ADD28:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD28]]
; STORE-NEXT:    [[ADD829:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD829]]
; STORE-NEXT:    [[ADD1330:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
; STORE-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; STORE-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE:       for.cond.for.end_crit_edge:
; STORE-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; STORE-NEXT:    br label [[FOR_END]]
; STORE:       for.end:
; STORE-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  %1 = load float, float* %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  %add28 = or i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  %3 = load float, float* %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  %4 = load float, float* %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4  ] +
;       B[1]*A[i*4+1] +
;       B[2]*A[i*4+2] +
;       B[3]*A[i*4+3];
;   }
;   return sum;
; }
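;
; Here the invariant B[0..3] loads are hoisted into the preheader as one
; <4 x float> load; each iteration's dot product is then a vector fmul plus an
; @llvm.vector.reduce.fadd.v4f32 call, followed by the scalar fmul into sum.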

define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
; CHECK-LABEL: @mul_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[ADD35:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; CHECK-NEXT:    [[ADD1136:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1136]]
; CHECK-NEXT:    [[ADD1737:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1737]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @mul_red(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT:    [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; STORE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; STORE-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[ADD35:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; STORE-NEXT:    [[ADD1136:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1136]]
; STORE-NEXT:    [[ADD1737:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1737]]
; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE:       for.cond.for.end_crit_edge:
; STORE-NEXT:    [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; STORE-NEXT:    br label [[FOR_END]]
; STORE:       for.end:
; STORE-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  %add35 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  %6 = load float, float* %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  %7 = load float, float* %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6  ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }
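;
; Nine products per iteration: both prefixes expect the first eight lanes to
; be vectorized as an <8 x float> fmul plus @llvm.vector.reduce.fadd.v8f32,
; with the ninth B[8]*A[i*6+8] product handled as a scalar fmul/fadd tail.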

define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
; CHECK-LABEL: @long_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds float, float* [[B]], i64 4
; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds float, float* [[B]], i64 6
; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds float, float* [[B]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B]] to <8 x float>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[ADD80:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD80]]
; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD11]]
; CHECK-NEXT:    [[ADD17:%.*]] = add nsw i64 [[MUL]], 3
; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD17]]
; CHECK-NEXT:    [[ADD23:%.*]] = add nsw i64 [[MUL]], 4
; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD23]]
; CHECK-NEXT:    [[ADD29:%.*]] = add nsw i64 [[MUL]], 5
; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD29]]
; CHECK-NEXT:    [[ADD35:%.*]] = add nsw i64 [[MUL]], 6
; CHECK-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; CHECK-NEXT:    [[ADD41:%.*]] = add nsw i64 [[MUL]], 7
; CHECK-NEXT:    [[ARRAYIDX42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD41]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
; CHECK-NEXT:    [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
; CHECK-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
; CHECK-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @long_red(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds float, float* [[B]], i64 4
; STORE-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
; STORE-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds float, float* [[B]], i64 6
; STORE-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds float, float* [[B]], i64 7
; STORE-NEXT:    [[TMP0:%.*]] = bitcast float* [[B]] to <8 x float>*
; STORE-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; STORE-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
; STORE-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
; STORE-NEXT:    [[TMP3:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[ADD80:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD80]]
; STORE-NEXT:    [[ADD11:%.*]] = add nsw i64 [[MUL]], 2
; STORE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD11]]
; STORE-NEXT:    [[ADD17:%.*]] = add nsw i64 [[MUL]], 3
; STORE-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD17]]
; STORE-NEXT:    [[ADD23:%.*]] = add nsw i64 [[MUL]], 4
; STORE-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD23]]
; STORE-NEXT:    [[ADD29:%.*]] = add nsw i64 [[MUL]], 5
; STORE-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD29]]
; STORE-NEXT:    [[ADD35:%.*]] = add nsw i64 [[MUL]], 6
; STORE-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; STORE-NEXT:    [[ADD41:%.*]] = add nsw i64 [[MUL]], 7
; STORE-NEXT:    [[ARRAYIDX42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD41]]
; STORE-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
; STORE-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
; STORE-NEXT:    [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
; STORE-NEXT:    [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; STORE-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
; STORE-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
; STORE-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
; STORE-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
; STORE-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
; STORE-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE:       for.cond.for.end_crit_edge:
; STORE-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; STORE-NEXT:    br label [[FOR_END]]
; STORE:       for.end:
; STORE-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  %4 = load float, float* %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  %5 = load float, float* %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  %6 = load float, float* %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  %7 = load float, float* %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  %8 = load float, float* %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  %add80 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  %11 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  %12 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  %13 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  %14 = load float, float* %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  %15 = load float, float* %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  %16 = load float, float* %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  %17 = load float, float* %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  %18 = load float, float* %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4  ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }
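;
; The four chained sum += statements form one reduction tree, so both prefixes
; expect a single <4 x float> fmul, an @llvm.vector.reduce.fadd.v4f32 call,
; and one extra fadd of the result into the accumulator phi (OP_EXTRA).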

define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
; CHECK-LABEL: @chain_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[ADD638:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD638]]
; CHECK-NEXT:    [[ADD1239:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1239]]
; CHECK-NEXT:    [[ADD1840:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1840]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT:    [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK:       for.cond.for.end_crit_edge:
; CHECK-NEXT:    [[PHITMP:%.*]] = fptosi float [[OP_EXTRA]] to i32
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @chain_red(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT:    [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; STORE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; STORE-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[ADD638:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD638]]
; STORE-NEXT:    [[ADD1239:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1239]]
; STORE-NEXT:    [[ADD1840:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1840]]
; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT:    [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE:       for.cond.for.end_crit_edge:
; STORE-NEXT:    [[PHITMP:%.*]] = fptosi float [[OP_EXTRA]] to i32
; STORE-NEXT:    br label [[FOR_END]]
; STORE:       for.end:
; STORE-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  %add = fadd fast float %sum.042, %mul3
  %add638 = or i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  %6 = load float, float* %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  %add1239 = or i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  %7 = load float, float* %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  %8 = load float, float* %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; void foo(const float *arg_A, unsigned arg_B, float *array) {
;   for (uint32_t i = 0; i < 6; ++i) {
;     const float *ptr = arg_A + i;
;     float w0 = array[i * 4 + 0];
;     float w1 = array[i * 4 + 1];
;     float w2 = array[i * 4 + 2];
;     float w3 = array[i * 4 + 3];
;
;     for (unsigned j = 0; j < arg_B; ++j) {
;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
;       const float x4 = x3 + (-4.0f * w2) + w3;
;       w1 = w0;
;       w0 = x1;
;       w3 = w2;
;       w2 = x3;
;     }
;
;     array[i * 4 + 0] = w0;
;     array[i * 4 + 1] = w1;
;     array[i * 4 + 2] = w2;
;     array[i * 4 + 3] = w3;
;   }
; }
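;
; The inner loop carries w0..w3 as cross-iteration recurrences, which appears
; to keep the SLP vectorizer from forming vectors here; both prefixes only
; verify that the scalar code is left unchanged.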

define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP0]], 2
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP0]], 3
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; CHECK:       for.body16.lr.ph:
; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; CHECK-NEXT:    br label [[FOR_BODY16:%.*]]
; CHECK:       for.cond.cleanup15:
; CHECK-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; CHECK-NEXT:    store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; CHECK-NEXT:    store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
; CHECK-NEXT:    store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; CHECK-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; CHECK:       for.body16:
; CHECK-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; CHECK-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; CHECK-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; CHECK-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; CHECK-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; CHECK-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; CHECK-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; CHECK-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; CHECK-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; CHECK-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; CHECK-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; CHECK-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; CHECK-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; CHECK-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
; STORE-LABEL: @foo(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.cond.cleanup:
; STORE-NEXT:    ret void
; STORE:       for.body:
; STORE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; STORE-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; STORE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
; STORE-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; STORE-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
; STORE-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; STORE-NEXT:    [[TMP4:%.*]] = or i64 [[TMP0]], 2
; STORE-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
; STORE-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
; STORE-NEXT:    [[TMP6:%.*]] = or i64 [[TMP0]], 3
; STORE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
; STORE-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; STORE-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; STORE:       for.body16.lr.ph:
; STORE-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; STORE-NEXT:    [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; STORE-NEXT:    br label [[FOR_BODY16:%.*]]
; STORE:       for.cond.cleanup15:
; STORE-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT:    store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; STORE-NEXT:    store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; STORE-NEXT:    store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
; STORE-NEXT:    store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
; STORE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; STORE-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; STORE-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; STORE:       for.body16:
; STORE-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; STORE-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; STORE-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; STORE-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; STORE-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; STORE-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; STORE-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; STORE-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; STORE-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; STORE-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; STORE-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; STORE-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; STORE-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; STORE-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; STORE-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
  %cmp1495 = icmp eq i32 %arg_B, 0
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
  ret void

for.body:                                         ; preds = %for.cond.cleanup15, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
  %0 = shl i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
  %1 = load float, float* %arrayidx, align 4
  %2 = or i64 %0, 1
  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
  %3 = load float, float* %arrayidx4, align 4
  %4 = or i64 %0, 2
  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
  %5 = load float, float* %arrayidx8, align 4
  %6 = or i64 %0, 3
  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
  %7 = load float, float* %arrayidx12, align 4
  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph

for.body16.lr.ph:                                 ; preds = %for.body
  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
  %8 = load float, float* %add.ptr, align 4
  br label %for.body16

for.cond.cleanup15:                               ; preds = %for.body16, %for.body
  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
  store float %w0.0.lcssa, float* %arrayidx, align 4
  store float %w1.0.lcssa, float* %arrayidx4, align 4
  store float %w2.0.lcssa, float* %arrayidx8, align 4
  store float %w3.0.lcssa, float* %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
  br i1 %exitcond109, label %for.cond.cleanup, label %for.body

for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
  %sub92 = fadd fast float %mul17, %mul18.neg
  %sub19 = fadd fast float %sub92, %8
  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
  %mul23 = fmul fast float %w1.099, 0x4002666660000000
  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
  %add2293 = fadd fast float %mul27.neg, %mul25
  %add24 = fadd fast float %add2293, %mul23
  %sub2694 = fadd fast float %add24, %mul21.neg
  %sub28 = fadd fast float %sub2694, %mul20
  %inc = add nuw i32 %j.098, 1
  %exitcond = icmp eq i32 %inc, %arg_B
  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
}


; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
;   }
; }
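;
; Only the STORE run vectorizes this: the two-element dot product feeding C[i]
; becomes a <2 x double> load and fmul followed by two extractelements and a
; scalar fadd, while the default run keeps the scalar form.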

define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red_double(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8
; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]]
; CHECK-NEXT:    [[ADD16:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
; CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8
; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]]
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; CHECK-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @store_red_double(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
; STORE-NEXT:    [[TMP0:%.*]] = bitcast double* [[B]] to <2 x double>*
; STORE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; STORE-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[ADD16:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
; STORE-NEXT:    [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
; STORE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; STORE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; STORE-NEXT:    [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
; STORE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; STORE-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE:       for.end:
; STORE-NEXT:    ret void
;
entry:
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load double, double* %B, align 8
  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  %1 = load double, double* %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  %3 = load double, double* %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  %add16 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  %4 = load double, double* %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  store double %add8, double* %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] +
;          B[1] *A[i*4+1] +
;          B[2] *A[i*4+2] +
;          B[3] *A[i*4+3];
;   }
;   return sum;
; }
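;
; As above, only the STORE run is expected to turn the four-term dot product
; stored to C[i] into a <4 x float> fmul plus @llvm.vector.reduce.fadd.v4f32;
; the default run keeps the scalar chain.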

define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast float [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[ADD34:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast float [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast float [[MUL3]], [[MUL7]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[ADD1135:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT:    [[MUL13:%.*]] = fmul fast float [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[ADD14:%.*]] = fadd fast float [[ADD8]], [[MUL13]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX15]], align 4
; CHECK-NEXT:    [[ADD1736:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX18]], align 4
; CHECK-NEXT:    [[MUL19:%.*]] = fmul fast float [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[ADD20:%.*]] = fadd fast float [[ADD14]], [[MUL19]]
; CHECK-NEXT:    store float [[ADD20]], float* [[C_ADDR_038]], align 4
; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 0
;
; STORE-LABEL: @store_red(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[ADD34:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
; STORE-NEXT:    [[ADD1135:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>*
; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT:    [[ADD1736:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT:    store float [[TMP6]], float* [[C_ADDR_038]], align 4
; STORE-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE:       for.end:
; STORE-NEXT:    ret i32 0
;
entry:
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, float* %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %2 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  %3 = load float, float* %arrayidx4, align 4
  %add34 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  %4 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, float* %arrayidx9, align 4
  %add1135 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  %6 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, float* %arrayidx15, align 4
  %add1736 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  store float %add20, float* %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}

@arr_i32 = global [32 x i32] zeroinitializer, align 16
@arr_float = global [32 x float] zeroinitializer, align 16

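; The @float_red_example* and @i32_red_example* functions below are
; hand-unrolled sum reductions over the global arrays declared above. As a
; rough C sketch of the first one (illustrative only; this comment is not
; part of the original test source):
;
;   extern float arr_float[32];
;   void float_red_example4(float *res) {
;     float sum = arr_float[0];
;     for (int i = 1; i < 4; ++i)   // fully unrolled in the IR below
;       sum += arr_float[i];
;     *res = sum;
;   }
;
; With horizontal reductions feeding a store enabled (the second RUN line),
; the chain of scalar adds is expected to fold into a single vector load plus
; one vector reduce intrinsic call.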
define void @float_red_example4(float* %res) {
; CHECK-LABEL: @float_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    store float [[ADD_2]], float* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @float_red_example4(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  store float %add.2, float* %res, align 16
  ret void
}

1109define void @float_red_example8(float* %res) {
1110; CHECK-LABEL: @float_red_example8(
1111; CHECK-NEXT:  entry:
1112; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
1113; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
1114; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
1115; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
1116; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
1117; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
1118; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
1119; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
1120; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
1121; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
1122; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
1123; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
1124; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
1125; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
1126; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
1127; CHECK-NEXT:    store float [[ADD_6]], float* [[RES:%.*]], align 16
1128; CHECK-NEXT:    ret void
1129;
1130; STORE-LABEL: @float_red_example8(
1131; STORE-NEXT:  entry:
1132; STORE-NEXT:    [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
1133; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
1134; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
1135; STORE-NEXT:    ret void
1136;
1137entry:
1138  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
1139  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
1140  %add = fadd fast float %1, %0
1141  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
1142  %add.1 = fadd fast float %2, %add
1143  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
1144  %add.2 = fadd fast float %3, %add.1
1145  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
1146  %add.3 = fadd fast float %4, %add.2
1147  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
1148  %add.4 = fadd fast float %5, %add.3
1149  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
1150  %add.5 = fadd fast float %6, %add.4
1151  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
1152  %add.6 = fadd fast float %7, %add.5
1153  store float %add.6, float* %res, align 16
1154  ret void
1155}
1156
1157define void @float_red_example16(float* %res) {
1158; CHECK-LABEL: @float_red_example16(
1159; CHECK-NEXT:  entry:
1160; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
1161; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
1162; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
1163; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
1164; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
1165; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
1166; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
1167; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
1168; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
1169; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
1170; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
1171; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
1172; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
1173; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
1174; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
1175; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
1176; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP8]], [[ADD_6]]
1177; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
1178; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP9]], [[ADD_7]]
1179; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
1180; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP10]], [[ADD_8]]
1181; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
1182; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP11]], [[ADD_9]]
1183; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
1184; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP12]], [[ADD_10]]
1185; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
1186; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP13]], [[ADD_11]]
1187; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
1188; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP14]], [[ADD_12]]
1189; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
1190; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float [[TMP15]], [[ADD_13]]
1191; CHECK-NEXT:    store float [[ADD_14]], float* [[RES:%.*]], align 16
1192; CHECK-NEXT:    ret void
1193;
1194; STORE-LABEL: @float_red_example16(
1195; STORE-NEXT:  entry:
1196; STORE-NEXT:    [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
1197; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
1198; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
1199; STORE-NEXT:    ret void
1200;
1201entry:
1202  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
1203  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
1204  %add = fadd fast float %1, %0
1205  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
1206  %add.1 = fadd fast float %2, %add
1207  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
1208  %add.2 = fadd fast float %3, %add.1
1209  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
1210  %add.3 = fadd fast float %4, %add.2
1211  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
1212  %add.4 = fadd fast float %5, %add.3
1213  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
1214  %add.5 = fadd fast float %6, %add.4
1215  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
1216  %add.6 = fadd fast float %7, %add.5
1217  %8 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
1218  %add.7 = fadd fast float %8, %add.6
1219  %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
1220  %add.8 = fadd fast float %9, %add.7
1221  %10 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
1222  %add.9 = fadd fast float %10, %add.8
1223  %11 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
1224  %add.10 = fadd fast float %11, %add.9
1225  %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
1226  %add.11 = fadd fast float %12, %add.10
1227  %13 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
1228  %add.12 = fadd fast float %13, %add.11
1229  %14 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
1230  %add.13 = fadd fast float %14, %add.12
1231  %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
1232  %add.14 = fadd fast float %15, %add.13
1233  store float %add.14, float* %res, align 16
1234  ret void
1235}
1236
define void @i32_red_example4(i32* %res) {
; CHECK-LABEL: @i32_red_example4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example4(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  store i32 %add.2, i32* %res, align 16
  ret void
}

1269define void @i32_red_example8(i32* %res) {
1270; CHECK-LABEL: @i32_red_example8(
1271; CHECK-NEXT:  entry:
1272; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
1273; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
1274; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
1275; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
1276; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
1277; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
1278; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
1279; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
1280; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
1281; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
1282; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
1283; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
1284; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
1285; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
1286; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
1287; CHECK-NEXT:    store i32 [[ADD_6]], i32* [[RES:%.*]], align 16
1288; CHECK-NEXT:    ret void
1289;
1290; STORE-LABEL: @i32_red_example8(
1291; STORE-NEXT:  entry:
1292; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
1293; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
1294; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
1295; STORE-NEXT:    ret void
1296;
1297entry:
1298  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
1299  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
1300  %add = add nsw i32 %1, %0
1301  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
1302  %add.1 = add nsw i32 %2, %add
1303  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
1304  %add.2 = add nsw i32 %3, %add.1
1305  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
1306  %add.3 = add nsw i32 %4, %add.2
1307  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
1308  %add.4 = add nsw i32 %5, %add.3
1309  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
1310  %add.5 = add nsw i32 %6, %add.4
1311  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
1312  %add.6 = add nsw i32 %7, %add.5
1313  store i32 %add.6, i32* %res, align 16
1314  ret void
1315}
1316
1317define void @i32_red_example16(i32* %res) {
1318; CHECK-LABEL: @i32_red_example16(
1319; CHECK-NEXT:  entry:
1320; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
1321; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
1322; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
1323; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
1324; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
1325; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
1326; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
1327; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
1328; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
1329; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
1330; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
1331; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
1332; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
1333; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
1334; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
1335; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
1336; CHECK-NEXT:    [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
1337; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
1338; CHECK-NEXT:    [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
1339; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
1340; CHECK-NEXT:    [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
1341; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
1342; CHECK-NEXT:    [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
1343; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
1344; CHECK-NEXT:    [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
1345; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
1346; CHECK-NEXT:    [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
1347; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
1348; CHECK-NEXT:    [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
1349; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
1350; CHECK-NEXT:    [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
1351; CHECK-NEXT:    store i32 [[ADD_14]], i32* [[RES:%.*]], align 16
1352; CHECK-NEXT:    ret void
1353;
1354; STORE-LABEL: @i32_red_example16(
1355; STORE-NEXT:  entry:
1356; STORE-NEXT:    [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
1357; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
1358; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
1359; STORE-NEXT:    ret void
1360;
1361entry:
1362  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
1363  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
1364  %add = add nsw i32 %1, %0
1365  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
1366  %add.1 = add nsw i32 %2, %add
1367  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
1368  %add.2 = add nsw i32 %3, %add.1
1369  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
1370  %add.3 = add nsw i32 %4, %add.2
1371  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
1372  %add.4 = add nsw i32 %5, %add.3
1373  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
1374  %add.5 = add nsw i32 %6, %add.4
1375  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
1376  %add.6 = add nsw i32 %7, %add.5
1377  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
1378  %add.7 = add nsw i32 %8, %add.6
1379  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
1380  %add.8 = add nsw i32 %9, %add.7
1381  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
1382  %add.9 = add nsw i32 %10, %add.8
1383  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
1384  %add.10 = add nsw i32 %11, %add.9
1385  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
1386  %add.11 = add nsw i32 %12, %add.10
1387  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
1388  %add.12 = add nsw i32 %13, %add.11
1389  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
1390  %add.13 = add nsw i32 %14, %add.12
1391  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
1392  %add.14 = add nsw i32 %15, %add.13
1393  store i32 %add.14, i32* %res, align 16
1394  ret void
1395}
1396
1397define void @i32_red_example32(i32* %res) {
1398; CHECK-LABEL: @i32_red_example32(
1399; CHECK-NEXT:  entry:
1400; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
1401; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
1402; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
1403; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
1404; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
1405; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
1406; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
1407; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
1408; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
1409; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
1410; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
1411; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
1412; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
1413; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
1414; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
1415; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
1416; CHECK-NEXT:    [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
1417; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
1418; CHECK-NEXT:    [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
1419; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
1420; CHECK-NEXT:    [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
1421; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
1422; CHECK-NEXT:    [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
1423; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
1424; CHECK-NEXT:    [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
1425; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
1426; CHECK-NEXT:    [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
1427; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
1428; CHECK-NEXT:    [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
1429; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
1430; CHECK-NEXT:    [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
1431; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
1432; CHECK-NEXT:    [[ADD_15:%.*]] = add nsw i32 [[TMP16]], [[ADD_14]]
1433; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
1434; CHECK-NEXT:    [[ADD_16:%.*]] = add nsw i32 [[TMP17]], [[ADD_15]]
1435; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
1436; CHECK-NEXT:    [[ADD_17:%.*]] = add nsw i32 [[TMP18]], [[ADD_16]]
1437; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
1438; CHECK-NEXT:    [[ADD_18:%.*]] = add nsw i32 [[TMP19]], [[ADD_17]]
1439; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
1440; CHECK-NEXT:    [[ADD_19:%.*]] = add nsw i32 [[TMP20]], [[ADD_18]]
1441; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
1442; CHECK-NEXT:    [[ADD_20:%.*]] = add nsw i32 [[TMP21]], [[ADD_19]]
1443; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
1444; CHECK-NEXT:    [[ADD_21:%.*]] = add nsw i32 [[TMP22]], [[ADD_20]]
1445; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
1446; CHECK-NEXT:    [[ADD_22:%.*]] = add nsw i32 [[TMP23]], [[ADD_21]]
1447; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
1448; CHECK-NEXT:    [[ADD_23:%.*]] = add nsw i32 [[TMP24]], [[ADD_22]]
1449; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
1450; CHECK-NEXT:    [[ADD_24:%.*]] = add nsw i32 [[TMP25]], [[ADD_23]]
1451; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
1452; CHECK-NEXT:    [[ADD_25:%.*]] = add nsw i32 [[TMP26]], [[ADD_24]]
1453; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
1454; CHECK-NEXT:    [[ADD_26:%.*]] = add nsw i32 [[TMP27]], [[ADD_25]]
1455; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
1456; CHECK-NEXT:    [[ADD_27:%.*]] = add nsw i32 [[TMP28]], [[ADD_26]]
1457; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
1458; CHECK-NEXT:    [[ADD_28:%.*]] = add nsw i32 [[TMP29]], [[ADD_27]]
1459; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
1460; CHECK-NEXT:    [[ADD_29:%.*]] = add nsw i32 [[TMP30]], [[ADD_28]]
1461; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
1462; CHECK-NEXT:    [[ADD_30:%.*]] = add nsw i32 [[TMP31]], [[ADD_29]]
1463; CHECK-NEXT:    store i32 [[ADD_30]], i32* [[RES:%.*]], align 16
1464; CHECK-NEXT:    ret void
1465;
1466; STORE-LABEL: @i32_red_example32(
1467; STORE-NEXT:  entry:
1468; STORE-NEXT:    [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
1469; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
1470; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
1471; STORE-NEXT:    ret void
1472;
1473entry:
1474  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
1475  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
1476  %add = add nsw i32 %1, %0
1477  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
1478  %add.1 = add nsw i32 %2, %add
1479  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
1480  %add.2 = add nsw i32 %3, %add.1
1481  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
1482  %add.3 = add nsw i32 %4, %add.2
1483  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
1484  %add.4 = add nsw i32 %5, %add.3
1485  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
1486  %add.5 = add nsw i32 %6, %add.4
1487  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
1488  %add.6 = add nsw i32 %7, %add.5
1489  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
1490  %add.7 = add nsw i32 %8, %add.6
1491  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
1492  %add.8 = add nsw i32 %9, %add.7
1493  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
1494  %add.9 = add nsw i32 %10, %add.8
1495  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
1496  %add.10 = add nsw i32 %11, %add.9
1497  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
1498  %add.11 = add nsw i32 %12, %add.10
1499  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
1500  %add.12 = add nsw i32 %13, %add.11
1501  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
1502  %add.13 = add nsw i32 %14, %add.12
1503  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
1504  %add.14 = add nsw i32 %15, %add.13
1505  %16 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
1506  %add.15 = add nsw i32 %16, %add.14
1507  %17 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
1508  %add.16 = add nsw i32 %17, %add.15
1509  %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
1510  %add.17 = add nsw i32 %18, %add.16
1511  %19 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
1512  %add.18 = add nsw i32 %19, %add.17
1513  %20 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
1514  %add.19 = add nsw i32 %20, %add.18
1515  %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
1516  %add.20 = add nsw i32 %21, %add.19
1517  %22 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
1518  %add.21 = add nsw i32 %22, %add.20
1519  %23 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
1520  %add.22 = add nsw i32 %23, %add.21
1521  %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
1522  %add.23 = add nsw i32 %24, %add.22
1523  %25 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
1524  %add.24 = add nsw i32 %25, %add.23
1525  %26 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
1526  %add.25 = add nsw i32 %26, %add.24
1527  %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
1528  %add.26 = add nsw i32 %27, %add.25
1529  %28 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
1530  %add.27 = add nsw i32 %28, %add.26
1531  %29 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
1532  %add.28 = add nsw i32 %29, %add.27
1533  %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
1534  %add.29 = add nsw i32 %30, %add.28
1535  %31 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
1536  %add.30 = add nsw i32 %31, %add.29
1537  store i32 %add.30, i32* %res, align 16
1538  ret void
1539}
1540
declare i32 @foobar(i32)

define void @i32_red_call(i32 %val) {
; CHECK-LABEL: @i32_red_call(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_call(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; STORE-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = call i32 @foobar(i32 %add.6)
  ret void
}

1578define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
1579; CHECK-LABEL: @i32_red_invoke(
1580; CHECK-NEXT:  entry:
1581; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
1582; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
1583; CHECK-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
1584; CHECK-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
1585; CHECK:       exception:
1586; CHECK-NEXT:    [[CLEANUP:%.*]] = landingpad i8
1587; CHECK-NEXT:    cleanup
1588; CHECK-NEXT:    br label [[NORMAL]]
1589; CHECK:       normal:
1590; CHECK-NEXT:    ret void
1591;
1592; STORE-LABEL: @i32_red_invoke(
1593; STORE-NEXT:  entry:
1594; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
1595; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
1596; STORE-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
1597; STORE-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
1598; STORE:       exception:
1599; STORE-NEXT:    [[CLEANUP:%.*]] = landingpad i8
1600; STORE-NEXT:    cleanup
1601; STORE-NEXT:    br label [[NORMAL]]
1602; STORE:       normal:
1603; STORE-NEXT:    ret void
1604;
1605entry:
1606  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
1607  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
1608  %add = add nsw i32 %1, %0
1609  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
1610  %add.1 = add nsw i32 %2, %add
1611  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
1612  %add.2 = add nsw i32 %3, %add.1
1613  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
1614  %add.3 = add nsw i32 %4, %add.2
1615  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
1616  %add.4 = add nsw i32 %5, %add.3
1617  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
1618  %add.5 = add nsw i32 %6, %add.4
1619  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
1620  %add.6 = add nsw i32 %7, %add.5
1621  %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
1622exception:
1623  %cleanup = landingpad i8 cleanup
1624  br label %normal
1625normal:
1626  ret void
1627}
1628
; Test case from PR47670. Reduction result is used as incoming value in phi.
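; A rough C equivalent of the pattern exercised below (illustrative sketch
; only; the original test provides just the IR):
;
;   int reduction_result_used_in_phi(const int *data, _Bool b) {
;     int sum = 0;
;     if (b)
;       sum = data[0] + data[1] + data[2] + data[3];
;     return sum;   // the reduction result reaches the return through a phi
;   }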
define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroext %b) {
; CHECK-LABEL: @reduction_result_used_in_phi(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; CHECK-NEXT:    ret i32 [[SUM_1]]
;
; STORE-LABEL: @reduction_result_used_in_phi(
; STORE-NEXT:  entry:
; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; STORE:       bb:
; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; STORE-NEXT:    br label [[EXIT]]
; STORE:       exit:
; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; STORE-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, i32* %data, align 4
  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
  %l.1 = load i32, i32* %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
  %l.2 = load i32, i32* %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
  %l.3 = load i32, i32* %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
  ret i32 %sum.1
}

1682define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
1683; CHECK-LABEL: @reduction_result_used_in_phi_loop(
1684; CHECK-NEXT:  entry:
1685; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
1686; CHECK:       bb:
1687; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
1688; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
1689; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
1690; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
1691; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
1692; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
1693; CHECK-NEXT:    br label [[EXIT]]
1694; CHECK:       exit:
1695; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
1696; CHECK-NEXT:    ret i32 [[SUM_1]]
1697;
1698; STORE-LABEL: @reduction_result_used_in_phi_loop(
1699; STORE-NEXT:  entry:
1700; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
1701; STORE:       bb:
1702; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
1703; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
1704; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
1705; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
1706; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
1707; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
1708; STORE-NEXT:    br label [[EXIT]]
1709; STORE:       exit:
1710; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
1711; STORE-NEXT:    ret i32 [[SUM_1]]
1712;
1713entry:
1714  br i1 %b, label %bb, label %exit
1715
1716bb:
1717  %l.0 = load i32, i32* %data, align 4
1718  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
1719  %l.1 = load i32, i32* %idx.1, align 4
1720  %add.1 = add i32 %l.1, %l.0
1721  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
1722  %l.2 = load i32, i32* %idx.2, align 4
1723  %add.2 = add i32 %l.2, %add.1
1724  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
1725  %l.3 = load i32, i32* %idx.3, align 4
1726  %add.3 = add i32 %l.3, %add.2
1727  br label %exit
1728
1729exit:
1730  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
1731  ret i32 %sum.1
1732}
1733
; Make sure we do not crash or infinite loop on ill-formed IR.

define void @unreachable_block() {
; CHECK-LABEL: @unreachable_block(
; CHECK-NEXT:  bb.0:
; CHECK-NEXT:    br label [[BB_1:%.*]]
; CHECK:       dead:
; CHECK-NEXT:    [[T0:%.*]] = add i16 [[T0]], undef
; CHECK-NEXT:    br label [[BB_1]]
; CHECK:       bb.1:
; CHECK-NEXT:    [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @unreachable_block(
; STORE-NEXT:  bb.0:
; STORE-NEXT:    br label [[BB_1:%.*]]
; STORE:       dead:
; STORE-NEXT:    [[T0:%.*]] = add i16 [[T0]], undef
; STORE-NEXT:    br label [[BB_1]]
; STORE:       bb.1:
; STORE-NEXT:    [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; STORE-NEXT:    ret void
;
bb.0:
  br label %bb.1

dead:
  %t0 = add i16 %t0, undef ; unreachable IR may depend on itself
  br label %bb.1

bb.1:
  %t1 = phi i16 [ undef, %bb.0 ], [ %t0, %dead ]
  ret void
}

declare i32 @__gxx_personality_v0(...)
