• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
2; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
3; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
4
; %struct.A: eight consecutive i8 fields (8 bytes in total).
; %struct.B: eight consecutive i32 fields (32 bytes in total).
5%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
6%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
7
8; CHECK-LABEL: merge_const_store:
9; save 1,2,3 ... as one big integer.
10; CHECK: movabsq $578437695752307201
11; CHECK: ret
; Loops over %count elements of %struct.A starting at %p and stores the byte
; constants 1..8 into the eight consecutive i8 fields of each element.
; The expected movabsq immediate 578437695752307201 equals 0x0807060504030201,
; i.e. those eight bytes packed into a single 64-bit value.
12define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
13  %1 = icmp sgt i32 %count, 0
14  br i1 %1, label %.lr.ph, label %._crit_edge
15.lr.ph:
  ; %i.02 is the iteration counter and %.01 the current element pointer.
  ; %0 in the phi operands is the implicit (unnamed) entry block.
16  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
17  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
18  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
19  store i8 1, i8* %2, align 1
20  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
21  store i8 2, i8* %3, align 1
22  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
23  store i8 3, i8* %4, align 1
24  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
25  store i8 4, i8* %5, align 1
26  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
27  store i8 5, i8* %6, align 1
28  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
29  store i8 6, i8* %7, align 1
30  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
31  store i8 7, i8* %8, align 1
32  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
33  store i8 8, i8* %9, align 1
  ; Bump the counter and step the pointer to the next %struct.A element.
34  %10 = add nsw i32 %i.02, 1
35  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
36  %exitcond = icmp eq i32 %10, %count
37  br i1 %exitcond, label %._crit_edge, label %.lr.ph
38._crit_edge:
39  ret void
40}
41
42; No vectors because we use noimplicitfloat
43; CHECK-LABEL: merge_const_store_no_vec:
44; CHECK-NOT: vmovups
45; CHECK: ret
; Loops over %count elements of %struct.B starting at %p and stores i32 0 into
; all eight fields of each element. The function carries the noimplicitfloat
; attribute, and the directives above require that no vmovups is emitted.
46define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
  ; Skip the loop entirely when %count <= 0.
47  %1 = icmp sgt i32 %count, 0
48  br i1 %1, label %.lr.ph, label %._crit_edge
49.lr.ph:
  ; %i.02 is the iteration counter and %.01 the current element pointer.
  ; %0 in the phi operands is the implicit (unnamed) entry block.
50  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
51  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
52  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
53  store i32 0, i32* %2, align 4
54  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
55  store i32 0, i32* %3, align 4
56  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
57  store i32 0, i32* %4, align 4
58  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
59  store i32 0, i32* %5, align 4
60  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
61  store i32 0, i32* %6, align 4
62  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
63  store i32 0, i32* %7, align 4
64  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
65  store i32 0, i32* %8, align 4
66  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
67  store i32 0, i32* %9, align 4
  ; Bump the counter and step the pointer to the next %struct.B element.
68  %10 = add nsw i32 %i.02, 1
69  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
70  %exitcond = icmp eq i32 %10, %count
71  br i1 %exitcond, label %._crit_edge, label %.lr.ph
72._crit_edge:
73  ret void
74}
75
76; Move the constants using a single vector store.
77; CHECK-LABEL: merge_const_store_vec:
78; CHECK: vmovups
79; CHECK: ret
; Loops over %count elements of %struct.B starting at %p and stores i32 0 into
; all eight fields (32 contiguous bytes) of each element. This function does
; not carry noimplicitfloat, and the directives above expect a vmovups.
80define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
81  %1 = icmp sgt i32 %count, 0
82  br i1 %1, label %.lr.ph, label %._crit_edge
83.lr.ph:
  ; %i.02 is the iteration counter and %.01 the current element pointer.
  ; %0 in the phi operands is the implicit (unnamed) entry block.
84  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
85  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
86  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
87  store i32 0, i32* %2, align 4
88  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
89  store i32 0, i32* %3, align 4
90  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
91  store i32 0, i32* %4, align 4
92  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
93  store i32 0, i32* %5, align 4
94  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
95  store i32 0, i32* %6, align 4
96  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
97  store i32 0, i32* %7, align 4
98  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
99  store i32 0, i32* %8, align 4
100  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
101  store i32 0, i32* %9, align 4
  ; Bump the counter and step the pointer to the next %struct.B element.
102  %10 = add nsw i32 %i.02, 1
103  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
104  %exitcond = icmp eq i32 %10, %count
105  br i1 %exitcond, label %._crit_edge, label %.lr.ph
106._crit_edge:
107  ret void
108}
109
110; Move the first 4 constants as a single 32-bit integer store. Move the rest as scalars.
111; CHECK-LABEL: merge_nonconst_store:
112; CHECK: movl $67305985
113; CHECK: movb
114; CHECK: movb
115; CHECK: movb
116; CHECK: movb
117; CHECK: ret
; Loops over %count elements of %struct.A starting at %p. Fields 0-3 receive
; the constants 1,2,3,4, field 4 receives the argument %zz, and fields 5-7
; receive the constants 6,7,8. The expected movl immediate 67305985 equals
; 0x04030201, i.e. the first four constant bytes packed into one 32-bit value.
118define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
119  %1 = icmp sgt i32 %count, 0
120  br i1 %1, label %.lr.ph, label %._crit_edge
121.lr.ph:
  ; %i.02 is the iteration counter and %.01 the current element pointer.
  ; %0 in the phi operands is the implicit (unnamed) entry block.
122  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
123  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
124  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
125  store i8 1, i8* %2, align 1
126  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
127  store i8 2, i8* %3, align 1
128  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
129  store i8 3, i8* %4, align 1
130  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
131  store i8 4, i8* %5, align 1
132  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
133  store i8 %zz, i8* %6, align 1                     ;  <----------- Not a const;
134  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
135  store i8 6, i8* %7, align 1
136  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
137  store i8 7, i8* %8, align 1
138  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
139  store i8 8, i8* %9, align 1
  ; Bump the counter and step the pointer to the next %struct.A element.
140  %10 = add nsw i32 %i.02, 1
141  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
142  %exitcond = icmp eq i32 %10, %count
143  br i1 %exitcond, label %._crit_edge, label %.lr.ph
144._crit_edge:
145  ret void
146}
147
148
149; CHECK-LABEL: merge_loads_i16:
150;  load:
151; BWON:  movzwl
152; BWOFF: movw
153;  store:
154; CHECK: movw
155; CHECK: ret
; Copies two adjacent bytes (fields 0 and 1 of %q) into fields 0 and 1 of each
; of %count destination elements starting at %p. Both loads are issued before
; both stores. %q and %p are marked noalias.
156define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
157  %1 = icmp sgt i32 %count, 0
158  br i1 %1, label %.lr.ph, label %._crit_edge
159
160.lr.ph:                                           ; preds = %0
  ; The source addresses do not change per iteration, so they are computed once.
161  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
162  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
163  br label %4
164
165; <label>:4                                       ; preds = %4, %.lr.ph
  ; %i.02 is the iteration counter and %.01 the current destination element.
166  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
167  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
168  %5 = load i8, i8* %2, align 1
169  %6 = load i8, i8* %3, align 1
170  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
171  store i8 %5, i8* %7, align 1
172  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
173  store i8 %6, i8* %8, align 1
174  %9 = add nsw i32 %i.02, 1
175  %10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
176  %exitcond = icmp eq i32 %9, %count
177  br i1 %exitcond, label %._crit_edge, label %4
178
179._crit_edge:                                      ; preds = %4, %0
180  ret void
181}
182
183; The loads and the stores are interleaved. Can't merge them.
184; CHECK-LABEL: no_merge_loads:
185; BWON:  movzbl
186; BWOFF: movb
187; CHECK: movb
188; BWON:  movzbl
189; BWOFF: movb
190; CHECK: movb
191; CHECK: ret
; Copies the same two source bytes (fields 0 and 1 of %q) into each of %count
; destination elements starting at %p, but in the order load, store, load,
; store: the second load comes after the first store.
192define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
193  %1 = icmp sgt i32 %count, 0
194  br i1 %1, label %.lr.ph, label %._crit_edge
195
196.lr.ph:                                           ; preds = %0
  ; The source addresses do not change per iteration, so they are computed once.
197  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
198  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
199  br label %a4
200
201a4:                                       ; preds = %a4, %.lr.ph
  ; %i.02 is the iteration counter and %.01 the current destination element.
202  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
203  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
204  %a5 = load i8, i8* %2, align 1
205  %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
206  store i8 %a5, i8* %a7, align 1
207  %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  ; This load is deliberately placed after the first store.
208  %a6 = load i8, i8* %3, align 1
209  store i8 %a6, i8* %a8, align 1
210  %a9 = add nsw i32 %i.02, 1
211  %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
212  %exitcond = icmp eq i32 %a9, %count
213  br i1 %exitcond, label %._crit_edge, label %a4
214
215._crit_edge:                                      ; preds = %a4, %0
216  ret void
217}
218
219
220; CHECK-LABEL: merge_loads_integer:
221;  load:
222; CHECK: movq
223;  store:
224; CHECK: movq
225; CHECK: ret
; Copies two adjacent i32 values (fields 0 and 1 of %q) into fields 0 and 1 of
; each of %count destination elements starting at %p. Both loads are issued
; before both stores, and none of the accesses has an explicit alignment.
226define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
227  %1 = icmp sgt i32 %count, 0
228  br i1 %1, label %.lr.ph, label %._crit_edge
229
230.lr.ph:                                           ; preds = %0
  ; The source addresses do not change per iteration, so they are computed once.
231  %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
232  %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
233  br label %4
234
235; <label>:4                                       ; preds = %4, %.lr.ph
  ; %i.02 is the iteration counter and %.01 the current destination element.
236  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
237  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
238  %5 = load i32, i32* %2
239  %6 = load i32, i32* %3
240  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
241  store i32 %5, i32* %7
242  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
243  store i32 %6, i32* %8
244  %9 = add nsw i32 %i.02, 1
245  %10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
246  %exitcond = icmp eq i32 %9, %count
247  br i1 %exitcond, label %._crit_edge, label %4
248
249._crit_edge:                                      ; preds = %4, %0
250  ret void
251}
252
253
254; CHECK-LABEL: merge_loads_vector:
255;  load:
256; CHECK: movups
257;  store:
258; CHECK: movups
259; CHECK: ret
; Copies four adjacent i32 values (fields 0-3 of %q, 16 contiguous bytes) into
; fields 0-3 of each of %count destination elements starting at %p. All four
; loads precede all four stores; no access has an explicit alignment.
260define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
261  %a1 = icmp sgt i32 %count, 0
262  br i1 %a1, label %.lr.ph, label %._crit_edge
263
264.lr.ph:                                           ; preds = %0
  ; The source addresses do not change per iteration, so they are computed once.
265  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
266  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
267  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
268  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
269  br label %block4
270
271block4:                                       ; preds = %block4, %.lr.ph
  ; %i.02 is the iteration counter and %.01 the current destination element.
272  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
273  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
274  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
275  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
276  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
277  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
278  %b1 = load i32, i32* %a2
279  %b2 = load i32, i32* %a3
280  %b3 = load i32, i32* %a4
281  %b4 = load i32, i32* %a5
282  store i32 %b1, i32* %a7
283  store i32 %b2, i32* %a8
284  store i32 %b3, i32* %a9
285  store i32 %b4, i32* %a10
286  %c9 = add nsw i32 %i.02, 1
287  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
288  %exitcond = icmp eq i32 %c9, %count
289  br i1 %exitcond, label %._crit_edge, label %block4
290
291._crit_edge:                                      ; preds = %block4, %0
292  ret void
293}
294
295;; On x86, even unaligned copies should be merged to vector ops.
296;; TODO: however, this cannot happen at the moment, due to brokenness
297;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp
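298;; visitSTORE.
;; NOTE(review): FileCheck uses the whole remainder of a directive line as the
;; pattern, so the trailing ';; TODO' text on the two negative checks below is
;; presumably matched literally, which would make them vacuous. Confirm this
;; before relying on them.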
299
300; CHECK-LABEL: merge_loads_no_align:
301;  load:
302; CHECK-NOT: vmovups ;; TODO
303;  store:
304; CHECK-NOT: vmovups ;; TODO
305; CHECK: ret
; Copies four adjacent i32 values (fields 0-3 of %q, 16 contiguous bytes) into
; fields 0-3 of each of %count destination elements starting at %p. All four
; loads precede all four stores, and every access is marked align 1.
306define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; Skip the loop entirely when %count <= 0.
307  %a1 = icmp sgt i32 %count, 0
308  br i1 %a1, label %.lr.ph, label %._crit_edge
309
310.lr.ph:                                           ; preds = %0
  ; The source addresses do not change per iteration, so they are computed once.
311  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
312  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
313  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
314  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
315  br label %block4
316
317block4:                                       ; preds = %block4, %.lr.ph
  ; %i.02 is the iteration counter and %.01 the current destination element.
318  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
319  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
320  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
321  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
322  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
323  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
324  %b1 = load i32, i32* %a2, align 1
325  %b2 = load i32, i32* %a3, align 1
326  %b3 = load i32, i32* %a4, align 1
327  %b4 = load i32, i32* %a5, align 1
328  store i32 %b1, i32* %a7, align 1
329  store i32 %b2, i32* %a8, align 1
330  store i32 %b3, i32* %a9, align 1
331  store i32 %b4, i32* %a10, align 1
332  %c9 = add nsw i32 %i.02, 1
333  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
334  %exitcond = icmp eq i32 %c9, %count
335  br i1 %exitcond, label %._crit_edge, label %block4
336
337._crit_edge:                                      ; preds = %block4, %0
338  ret void
339}
340
341; Make sure that we merge the consecutive load/store sequence below and use a
342; word (16 bit) instead of a byte copy.
343; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
344; BWON: movzwl   (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
345; BWOFF: movw    (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
346; CHECK: movw    %[[REG]], (%{{.*}})
; Each iteration loads an i64 index from the %a cursor, reads the two bytes at
; %c[index] and %c[index + 1], and writes them to two consecutive bytes at the
; %b cursor. The %b cursor then advances by 2 and the %a cursor by one i64.
; The loop body always runs at least once and stops when the down-counter
; started at %n reaches 0.
347define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
348  br label %1
349
350; <label>:1
  ; %.09 is the remaining-iterations counter, %.08 the destination cursor and
  ; %.0 the index-source cursor. %0 is the implicit (unnamed) entry block.
351  %.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
352  %.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
353  %.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
354  %2 = getelementptr inbounds i64, i64* %.0, i64 1
355  %3 = load i64, i64* %.0, align 1
356  %4 = getelementptr inbounds i8, i8* %c, i64 %3
357  %5 = load i8, i8* %4, align 1
  ; The second source address is base %c plus (index + 1), computed in i64.
358  %6 = add i64 %3, 1
359  %7 = getelementptr inbounds i8, i8* %c, i64 %6
360  %8 = load i8, i8* %7, align 1
361  store i8 %5, i8* %.08, align 1
362  %9 = getelementptr inbounds i8, i8* %.08, i64 1
363  store i8 %8, i8* %9, align 1
364  %10 = getelementptr inbounds i8, i8* %.08, i64 2
365  %11 = add nsw i32 %.09, -1
366  %12 = icmp eq i32 %11, 0
367  br i1 %12, label %13, label %1
368
369; <label>:13
370  ret void
371}
372
373; Make sure that we merge the consecutive load/store sequence below and use a
374; word (16 bit) instead of a byte copy even if there are intermediate sign
375; extensions.
376; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
377; BWON: movzwl   (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
378; BWOFF: movw    (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
379; CHECK: movw    %[[REG]], (%{{.*}})
; Each iteration loads an i8 index from the %a cursor and sign-extends it to
; i64. It then reads the two bytes at %c[index] and %c[index + 1], with the +1
; done in i64 after the extension, and writes them to two consecutive bytes at
; the %b cursor. The loop body always runs at least once and stops when the
; down-counter started at %n reaches 0.
380define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
381  br label %1
382
383; <label>:1
  ; %.09 is the remaining-iterations counter, %.08 the destination cursor and
  ; %.0 the index-source cursor. %0 is the implicit (unnamed) entry block.
384  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
385  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
386  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
387  %2 = getelementptr inbounds i8, i8* %.0, i64 1
388  %3 = load i8, i8* %.0, align 1
389  %4 = sext i8 %3 to i64
390  %5 = getelementptr inbounds i8, i8* %c, i64 %4
391  %6 = load i8, i8* %5, align 1
  ; The +1 is applied to the already sign-extended i64 index.
392  %7 = add i64 %4, 1
393  %8 = getelementptr inbounds i8, i8* %c, i64 %7
394  %9 = load i8, i8* %8, align 1
395  store i8 %6, i8* %.08, align 1
396  %10 = getelementptr inbounds i8, i8* %.08, i64 1
397  store i8 %9, i8* %10, align 1
398  %11 = getelementptr inbounds i8, i8* %.08, i64 2
399  %12 = add nsw i32 %.09, -1
400  %13 = icmp eq i32 %12, 0
401  br i1 %13, label %14, label %1
402
403; <label>:14
404  ret void
405}
406
407; However, we can only ignore sign extensions when merging if they are applied
408; to all memory computations.
409; CHECK-LABEL: loadStoreBaseIndexOffsetSextNoSex:
410; CHECK-NOT: movw    (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
411; CHECK-NOT: movw    [[REG]], (%{{.*}})
; Each iteration loads an i8 index from the %a cursor. The first source byte is
; read at %c[sext(index)]. The second is read at %c[sext(index + 1)], where the
; +1 is done in i8 before the sign extension, so the addition can wrap at
; 8 bits. Both bytes are written to two consecutive bytes at the %b cursor.
412define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
413  br label %1
414
415; <label>:1
  ; %.09 is the remaining-iterations counter, %.08 the destination cursor and
  ; %.0 the index-source cursor. %0 is the implicit (unnamed) entry block.
416  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
417  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
418  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
419  %2 = getelementptr inbounds i8, i8* %.0, i64 1
420  %3 = load i8, i8* %.0, align 1
421  %4 = sext i8 %3 to i64
422  %5 = getelementptr inbounds i8, i8* %c, i64 %4
423  %6 = load i8, i8* %5, align 1
  ; The +1 happens in the narrow i8 type and only then is the result extended.
424  %7 = add i8 %3, 1
425  %wrap.4 = sext i8 %7 to i64
426  %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
427  %9 = load i8, i8* %8, align 1
428  store i8 %6, i8* %.08, align 1
429  %10 = getelementptr inbounds i8, i8* %.08, i64 1
430  store i8 %9, i8* %10, align 1
431  %11 = getelementptr inbounds i8, i8* %.08, i64 2
432  %12 = add nsw i32 %.09, -1
433  %13 = icmp eq i32 %12, 0
434  br i1 %13, label %14, label %1
435
436; <label>:14
437  ret void
438}
439
440; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
; Extracts all eight lanes of the <8 x float> argument %v and stores lane i to
; %ptr[i] for i = 0..7 (32 contiguous bytes), each store with align 4.
441define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
442  %vecext0 = extractelement <8 x float> %v, i32 0
443  %vecext1 = extractelement <8 x float> %v, i32 1
444  %vecext2 = extractelement <8 x float> %v, i32 2
445  %vecext3 = extractelement <8 x float> %v, i32 3
446  %vecext4 = extractelement <8 x float> %v, i32 4
447  %vecext5 = extractelement <8 x float> %v, i32 5
448  %vecext6 = extractelement <8 x float> %v, i32 6
449  %vecext7 = extractelement <8 x float> %v, i32 7
  ; Lane 0 is stored directly through %ptr, so no index-0 address is computed.
450  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
451  %arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
452  %arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
453  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
454  %arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
455  %arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
456  %arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
457  store float %vecext0, float* %ptr, align 4
458  store float %vecext1, float* %arrayidx1, align 4
459  store float %vecext2, float* %arrayidx2, align 4
460  store float %vecext3, float* %arrayidx3, align 4
461  store float %vecext4, float* %arrayidx4, align 4
462  store float %vecext5, float* %arrayidx5, align 4
463  store float %vecext6, float* %arrayidx6, align 4
464  store float %vecext7, float* %arrayidx7, align 4
465  ret void
466
467; CHECK-LABEL: merge_vec_element_store
468; CHECK: vmovups
469; CHECK-NEXT: vzeroupper
470; CHECK-NEXT: retq
471}
472
473; PR21711 - Merge vector stores into wider vector stores.
474; These should be merged into 32-byte stores.
; Splits each of the <8 x float> arguments %v1 and %v2 into its low and high
; <4 x float> halves and stores the four halves to %ptr[3], %ptr[4], %ptr[5]
; and %ptr[6] (byte offsets 48, 64, 80 and 96), each store with align 16.
475define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
476  %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
477  %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
478  %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
479  %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
  ; Shuffle masks 0-3 select the low half of a source, masks 4-7 the high half.
480  %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
481  %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
482  %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
483  %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
484  store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
485  store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
486  store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
487  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
488  ret void
489
490; CHECK-LABEL: merge_vec_extract_stores
491; CHECK:      vmovups %ymm0, 48(%rdi)
492; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
493; CHECK-NEXT: vzeroupper
494; CHECK-NEXT: retq
495}
496
497; Merging vector stores when sourced from vector loads is not currently handled.
; Loads two adjacent <4 x float> vectors from %v[0] and %v[1] and stores them
; to %ptr[0] and %ptr[1]. The stores are align 16; the loads carry no explicit
; alignment. The directives below expect four separate vmovaps instructions.
498define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
499  %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
500  %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
501  %v0 = load <4 x float>, <4 x float>* %load_idx0
502  %v1 = load <4 x float>, <4 x float>* %load_idx1
503  %store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
504  %store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
505  store <4 x float> %v0, <4 x float>* %store_idx0, align 16
506  store <4 x float> %v1, <4 x float>* %store_idx1, align 16
507  ret void
508
509; CHECK-LABEL: merge_vec_stores_from_loads
510; CHECK:      vmovaps
511; CHECK-NEXT: vmovaps
512; CHECK-NEXT: vmovaps
513; CHECK-NEXT: vmovaps
514; CHECK-NEXT: retq
515}
516
517; Merging vector stores when sourced from a constant vector is not currently handled.
; Stores an all-zero <4 x i32> constant to %ptr[3] and %ptr[4] (two adjacent
; 16-byte slots), each store with align 16. The directives below expect one
; vxorps followed by two separate vmovaps stores.
518define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
519  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
520  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
521  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
522  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
523  ret void
524
525; CHECK-LABEL: merge_vec_stores_of_constants
526; CHECK:      vxorps
527; CHECK-NEXT: vmovaps
528; CHECK-NEXT: vmovaps
529; CHECK-NEXT: retq
530}
531
532; This is a minimized test based on real code that was failing.
533; We could merge stores (and loads) like this...
534
; Copies array[0] to array[4] through a scalar i64 load. It then loads a
; <2 x i64> vector starting at array[1], extracts lane 0 (the value of
; array[1]) and stores it to array[5]. The two stores are adjacent, but one
; value comes from a scalar load and the other from a vector-element extract.
535define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
536  %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
537  %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
538  %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
539  %idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5
540
541  %a0 = load i64, i64* %idx0, align 8
542  store i64 %a0, i64* %idx4, align 8
543
  ; Reinterpret &array[1] as a pointer to a two-element i64 vector.
544  %b = bitcast i64* %idx1 to <2 x i64>*
545  %v = load <2 x i64>, <2 x i64>* %b, align 8
546  %a1 = extractelement <2 x i64> %v, i32 0
547  store i64 %a1, i64* %idx5, align 8
548  ret void
549
550; CHECK-LABEL: merge_vec_element_and_scalar_load
551; CHECK:      movq	(%rdi), %rax
552; CHECK-NEXT: movq	%rax, 32(%rdi)
553; CHECK-NEXT: movq	8(%rdi), %rax
554; CHECK-NEXT: movq	%rax, 40(%rdi)
555; CHECK-NEXT: retq
556}
557