• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
3; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
4
; %struct.A is eight consecutive i8 fields; %struct.B is eight consecutive i32
; fields. The tests below load from / store to adjacent fields of these structs.
5%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
6%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
7
8; Store the constants 1..8 into the eight adjacent i8 fields of %struct.A on
; every loop iteration. The expected assembly merges the eight byte stores into
; one 64-bit store of 0x0807060504030201.
9define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
10; CHECK-LABEL: merge_const_store:
11; CHECK:       # %bb.0:
12; CHECK-NEXT:    testl %edi, %edi
13; CHECK-NEXT:    jle .LBB0_3
14; CHECK-NEXT:  # %bb.1: # %.lr.ph.preheader
15; CHECK-NEXT:    movabsq $578437695752307201, %rax # imm = 0x807060504030201
16; CHECK-NEXT:    .p2align 4, 0x90
17; CHECK-NEXT:  .LBB0_2: # %.lr.ph
18; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
19; CHECK-NEXT:    movq %rax, (%rsi)
20; CHECK-NEXT:    addq $8, %rsi
21; CHECK-NEXT:    decl %edi
22; CHECK-NEXT:    jne .LBB0_2
23; CHECK-NEXT:  .LBB0_3: # %._crit_edge
24; CHECK-NEXT:    retq
; Enter the loop only when %count > 0.
25  %1 = icmp sgt i32 %count, 0
26  br i1 %1, label %.lr.ph, label %._crit_edge
27.lr.ph:
; %i.02 is the iteration counter; %.01 is the struct written this iteration.
28  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
29  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
; Eight one-byte constant stores to consecutive fields 0..7.
30  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
31  store i8 1, i8* %2, align 1
32  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
33  store i8 2, i8* %3, align 1
34  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
35  store i8 3, i8* %4, align 1
36  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
37  store i8 4, i8* %5, align 1
38  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
39  store i8 5, i8* %6, align 1
40  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
41  store i8 6, i8* %7, align 1
42  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
43  store i8 7, i8* %8, align 1
44  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
45  store i8 8, i8* %9, align 1
; Advance to the next struct and loop until %count iterations have run.
46  %10 = add nsw i32 %i.02, 1
47  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
48  %exitcond = icmp eq i32 %10, %count
49  br i1 %exitcond, label %._crit_edge, label %.lr.ph
50._crit_edge:
51  ret void
52}
53
54; No vectors because we use noimplicitfloat: the eight i32 zero stores to
; %struct.B are expected to merge only into four 64-bit scalar stores of zero
; per loop iteration.
55define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
56; CHECK-LABEL: merge_const_store_no_vec:
57; CHECK:       # %bb.0:
58; CHECK-NEXT:    testl %edi, %edi
59; CHECK-NEXT:    jle .LBB1_2
60; CHECK-NEXT:    .p2align 4, 0x90
61; CHECK-NEXT:  .LBB1_1: # %.lr.ph
62; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
63; CHECK-NEXT:    movq $0, (%rsi)
64; CHECK-NEXT:    movq $0, 8(%rsi)
65; CHECK-NEXT:    movq $0, 16(%rsi)
66; CHECK-NEXT:    movq $0, 24(%rsi)
67; CHECK-NEXT:    addq $32, %rsi
68; CHECK-NEXT:    decl %edi
69; CHECK-NEXT:    jne .LBB1_1
70; CHECK-NEXT:  .LBB1_2: # %._crit_edge
71; CHECK-NEXT:    retq
; Enter the loop only when %count > 0.
72  %1 = icmp sgt i32 %count, 0
73  br i1 %1, label %.lr.ph, label %._crit_edge
74.lr.ph:
; %i.02 is the iteration counter; %.01 is the struct written this iteration.
75  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
76  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
; Eight i32 zero stores to consecutive fields 0..7 (32 bytes in total).
77  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
78  store i32 0, i32* %2, align 4
79  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
80  store i32 0, i32* %3, align 4
81  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
82  store i32 0, i32* %4, align 4
83  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
84  store i32 0, i32* %5, align 4
85  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
86  store i32 0, i32* %6, align 4
87  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
88  store i32 0, i32* %7, align 4
89  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
90  store i32 0, i32* %8, align 4
91  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
92  store i32 0, i32* %9, align 4
; Advance to the next struct and loop until %count iterations have run.
93  %10 = add nsw i32 %i.02, 1
94  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
95  %exitcond = icmp eq i32 %10, %count
96  br i1 %exitcond, label %._crit_edge, label %.lr.ph
97._crit_edge:
98  ret void
99}
100
101; Move the constants using a single vector store: the eight i32 zero stores to
; %struct.B are expected to become one 32-byte store of a zeroed ymm register
; per loop iteration.
102define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
103; CHECK-LABEL: merge_const_store_vec:
104; CHECK:       # %bb.0:
105; CHECK-NEXT:    testl %edi, %edi
106; CHECK-NEXT:    jle .LBB2_3
107; CHECK-NEXT:  # %bb.1: # %.lr.ph.preheader
108; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
109; CHECK-NEXT:    .p2align 4, 0x90
110; CHECK-NEXT:  .LBB2_2: # %.lr.ph
111; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
112; CHECK-NEXT:    vmovups %ymm0, (%rsi)
113; CHECK-NEXT:    addq $32, %rsi
114; CHECK-NEXT:    decl %edi
115; CHECK-NEXT:    jne .LBB2_2
116; CHECK-NEXT:  .LBB2_3: # %._crit_edge
117; CHECK-NEXT:    vzeroupper
118; CHECK-NEXT:    retq
; Enter the loop only when %count > 0.
119  %1 = icmp sgt i32 %count, 0
120  br i1 %1, label %.lr.ph, label %._crit_edge
121.lr.ph:
; %i.02 is the iteration counter; %.01 is the struct written this iteration.
122  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
123  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
; Eight i32 zero stores to consecutive fields 0..7 (32 bytes in total).
124  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
125  store i32 0, i32* %2, align 4
126  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
127  store i32 0, i32* %3, align 4
128  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
129  store i32 0, i32* %4, align 4
130  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
131  store i32 0, i32* %5, align 4
132  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
133  store i32 0, i32* %6, align 4
134  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
135  store i32 0, i32* %7, align 4
136  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
137  store i32 0, i32* %8, align 4
138  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
139  store i32 0, i32* %9, align 4
; Advance to the next struct and loop until %count iterations have run.
140  %10 = add nsw i32 %i.02, 1
141  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
142  %exitcond = icmp eq i32 %10, %count
143  br i1 %exitcond, label %._crit_edge, label %.lr.ph
144._crit_edge:
145  ret void
146}
147
148; Bytes 0..3 are constants and are expected to merge into a single 32-bit
; immediate store. Byte 4 is the non-constant %zz and is stored on its own. Of
; the remaining constants, bytes 5..6 are expected to merge into one 16-bit
; store and byte 7 stays a byte store.
149define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
150; CHECK-LABEL: merge_nonconst_store:
151; CHECK:       # %bb.0:
152; CHECK-NEXT:    testl %edi, %edi
153; CHECK-NEXT:    jle .LBB3_2
154; CHECK-NEXT:    .p2align 4, 0x90
155; CHECK-NEXT:  .LBB3_1: # %.lr.ph
156; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
157; CHECK-NEXT:    movl $67305985, (%rdx) # imm = 0x4030201
158; CHECK-NEXT:    movb %sil, 4(%rdx)
159; CHECK-NEXT:    movw $1798, 5(%rdx) # imm = 0x706
160; CHECK-NEXT:    movb $8, 7(%rdx)
161; CHECK-NEXT:    addq $8, %rdx
162; CHECK-NEXT:    decl %edi
163; CHECK-NEXT:    jne .LBB3_1
164; CHECK-NEXT:  .LBB3_2: # %._crit_edge
165; CHECK-NEXT:    retq
; Enter the loop only when %count > 0.
166  %1 = icmp sgt i32 %count, 0
167  br i1 %1, label %.lr.ph, label %._crit_edge
168.lr.ph:
; %i.02 is the iteration counter; %.01 is the struct written this iteration.
169  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
170  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
; Byte stores to consecutive fields 0..7; only field 4 gets a non-constant value.
171  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
172  store i8 1, i8* %2, align 1
173  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
174  store i8 2, i8* %3, align 1
175  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
176  store i8 3, i8* %4, align 1
177  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
178  store i8 4, i8* %5, align 1
179  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
180  store i8 %zz, i8* %6, align 1                     ;  <----------- Not a const;
181  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
182  store i8 6, i8* %7, align 1
183  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
184  store i8 7, i8* %8, align 1
185  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
186  store i8 8, i8* %9, align 1
; Advance to the next struct and loop until %count iterations have run.
187  %10 = add nsw i32 %i.02, 1
188  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
189  %exitcond = icmp eq i32 %10, %count
190  br i1 %exitcond, label %._crit_edge, label %.lr.ph
191._crit_edge:
192  ret void
193}
194
; Copy the first two i8 fields of %q into the first two fields of each
; destination struct. Both loads precede both stores, and the expected assembly
; merges them into one 16-bit load and one 16-bit store. The two run
; configurations differ only in the load instruction (movzwl vs. movw).
195define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
196; BWON-LABEL: merge_loads_i16:
197; BWON:       # %bb.0:
198; BWON-NEXT:    testl %edi, %edi
199; BWON-NEXT:    jle .LBB4_2
200; BWON-NEXT:    .p2align 4, 0x90
201; BWON-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
202; BWON-NEXT:    movzwl (%rsi), %eax
203; BWON-NEXT:    movw %ax, (%rdx)
204; BWON-NEXT:    addq $8, %rdx
205; BWON-NEXT:    decl %edi
206; BWON-NEXT:    jne .LBB4_1
207; BWON-NEXT:  .LBB4_2: # %._crit_edge
208; BWON-NEXT:    retq
209;
210; BWOFF-LABEL: merge_loads_i16:
211; BWOFF:       # %bb.0:
212; BWOFF-NEXT:    testl %edi, %edi
213; BWOFF-NEXT:    jle .LBB4_2
214; BWOFF-NEXT:    .p2align 4, 0x90
215; BWOFF-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
216; BWOFF-NEXT:    movw (%rsi), %ax
217; BWOFF-NEXT:    movw %ax, (%rdx)
218; BWOFF-NEXT:    addq $8, %rdx
219; BWOFF-NEXT:    decl %edi
220; BWOFF-NEXT:    jne .LBB4_1
221; BWOFF-NEXT:  .LBB4_2: # %._crit_edge
222; BWOFF-NEXT:    retq
; Enter the loop only when %count > 0.
223  %1 = icmp sgt i32 %count, 0
224  br i1 %1, label %.lr.ph, label %._crit_edge
225
226.lr.ph:                                           ; preds = %0
; Source addresses (fields 0 and 1 of %q) are computed once, outside the loop.
227  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
228  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
229  br label %4
230
231; <label>:4                                       ; preds = %4, %.lr.ph
; %i.02 is the iteration counter; %.01 is the destination struct.
232  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
233  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
; Both byte loads come first, followed by both byte stores.
234  %5 = load i8, i8* %2, align 1
235  %6 = load i8, i8* %3, align 1
236  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
237  store i8 %5, i8* %7, align 1
238  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
239  store i8 %6, i8* %8, align 1
; Advance the destination and loop until %count iterations have run.
240  %9 = add nsw i32 %i.02, 1
241  %10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
242  %exitcond = icmp eq i32 %9, %count
243  br i1 %exitcond, label %._crit_edge, label %4
244
245._crit_edge:                                      ; preds = %4, %0
246  ret void
247}
248
249; The loads and the stores are interleaved (load, store, load, store). Can't
; merge them: the expected assembly keeps two separate byte loads and two
; separate byte stores per iteration.
250define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
251; BWON-LABEL: no_merge_loads:
252; BWON:       # %bb.0:
253; BWON-NEXT:    testl %edi, %edi
254; BWON-NEXT:    jle .LBB5_2
255; BWON-NEXT:    .p2align 4, 0x90
256; BWON-NEXT:  .LBB5_1: # %a4
257; BWON-NEXT:    # =>This Inner Loop Header: Depth=1
258; BWON-NEXT:    movzbl (%rsi), %eax
259; BWON-NEXT:    movb %al, (%rdx)
260; BWON-NEXT:    movzbl 1(%rsi), %eax
261; BWON-NEXT:    movb %al, 1(%rdx)
262; BWON-NEXT:    addq $8, %rdx
263; BWON-NEXT:    decl %edi
264; BWON-NEXT:    jne .LBB5_1
265; BWON-NEXT:  .LBB5_2: # %._crit_edge
266; BWON-NEXT:    retq
267;
268; BWOFF-LABEL: no_merge_loads:
269; BWOFF:       # %bb.0:
270; BWOFF-NEXT:    testl %edi, %edi
271; BWOFF-NEXT:    jle .LBB5_2
272; BWOFF-NEXT:    .p2align 4, 0x90
273; BWOFF-NEXT:  .LBB5_1: # %a4
274; BWOFF-NEXT:    # =>This Inner Loop Header: Depth=1
275; BWOFF-NEXT:    movb (%rsi), %al
276; BWOFF-NEXT:    movb %al, (%rdx)
277; BWOFF-NEXT:    movb 1(%rsi), %al
278; BWOFF-NEXT:    movb %al, 1(%rdx)
279; BWOFF-NEXT:    addq $8, %rdx
280; BWOFF-NEXT:    decl %edi
281; BWOFF-NEXT:    jne .LBB5_1
282; BWOFF-NEXT:  .LBB5_2: # %._crit_edge
283; BWOFF-NEXT:    retq
; Enter the loop only when %count > 0.
284  %1 = icmp sgt i32 %count, 0
285  br i1 %1, label %.lr.ph, label %._crit_edge
286
287.lr.ph:                                           ; preds = %0
; Source addresses (fields 0 and 1 of %q) are computed once, outside the loop.
288  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
289  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
290  br label %a4
291
292a4:                                       ; preds = %a4, %.lr.ph
; %i.02 is the iteration counter; %.01 is the destination struct.
293  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
294  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
; First byte: load then store.
295  %a5 = load i8, i8* %2, align 1
296  %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
297  store i8 %a5, i8* %a7, align 1
; Second byte: its load comes after the first store, which is the interleaving
; this test is about.
298  %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
299  %a6 = load i8, i8* %3, align 1
300  store i8 %a6, i8* %a8, align 1
; Advance the destination and loop until %count iterations have run.
301  %a9 = add nsw i32 %i.02, 1
302  %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
303  %exitcond = icmp eq i32 %a9, %count
304  br i1 %exitcond, label %._crit_edge, label %a4
305
306._crit_edge:                                      ; preds = %a4, %0
307  ret void
308}
309
; Copy the first two i32 fields of %q into the first two fields of each
; destination struct. The expected assembly merges the two 32-bit loads and the
; two 32-bit stores into one 64-bit load and one 64-bit store (movq).
310define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
311; CHECK-LABEL: merge_loads_integer:
312; CHECK:       # %bb.0:
313; CHECK-NEXT:    testl %edi, %edi
314; CHECK-NEXT:    jle .LBB6_2
315; CHECK-NEXT:    .p2align 4, 0x90
316; CHECK-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
317; CHECK-NEXT:    movq (%rsi), %rax
318; CHECK-NEXT:    movq %rax, (%rdx)
319; CHECK-NEXT:    addq $32, %rdx
320; CHECK-NEXT:    decl %edi
321; CHECK-NEXT:    jne .LBB6_1
322; CHECK-NEXT:  .LBB6_2: # %._crit_edge
323; CHECK-NEXT:    retq
; Enter the loop only when %count > 0.
324  %1 = icmp sgt i32 %count, 0
325  br i1 %1, label %.lr.ph, label %._crit_edge
326
327.lr.ph:                                           ; preds = %0
; Source addresses (fields 0 and 1 of %q) are computed once, outside the loop.
328  %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
329  %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
330  br label %4
331
332; <label>:4                                       ; preds = %4, %.lr.ph
; %i.02 is the iteration counter; %.01 is the destination struct.
333  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
334  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
; Both loads come first, followed by both stores; none carries an explicit
; alignment.
335  %5 = load i32, i32* %2
336  %6 = load i32, i32* %3
337  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
338  store i32 %5, i32* %7
339  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
340  store i32 %6, i32* %8
; Advance the destination and loop until %count iterations have run.
341  %9 = add nsw i32 %i.02, 1
342  %10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
343  %exitcond = icmp eq i32 %9, %count
344  br i1 %exitcond, label %._crit_edge, label %4
345
346._crit_edge:                                      ; preds = %4, %0
347  ret void
348}
349
; Copy the first four i32 fields of %q into the first four fields of each
; destination struct. The expected assembly merges the four loads and four
; stores into one 16-byte vector load and one 16-byte vector store (vmovups
; with an xmm register).
350define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
351; CHECK-LABEL: merge_loads_vector:
352; CHECK:       # %bb.0:
353; CHECK-NEXT:    testl %edi, %edi
354; CHECK-NEXT:    jle .LBB7_2
355; CHECK-NEXT:    .p2align 4, 0x90
356; CHECK-NEXT:  .LBB7_1: # %block4
357; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
358; CHECK-NEXT:    vmovups (%rsi), %xmm0
359; CHECK-NEXT:    vmovups %xmm0, (%rdx)
360; CHECK-NEXT:    addq $32, %rdx
361; CHECK-NEXT:    decl %edi
362; CHECK-NEXT:    jne .LBB7_1
363; CHECK-NEXT:  .LBB7_2: # %._crit_edge
364; CHECK-NEXT:    retq
; Enter the loop only when %count > 0.
365  %a1 = icmp sgt i32 %count, 0
366  br i1 %a1, label %.lr.ph, label %._crit_edge
367
368.lr.ph:                                           ; preds = %0
; Source addresses (fields 0..3 of %q) are computed once, outside the loop.
369  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
370  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
371  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
372  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
373  br label %block4
374
375block4:                                       ; preds = %block4, %.lr.ph
; %i.02 is the iteration counter; %.01 is the destination struct.
376  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
377  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
; Destination addresses: fields 0..3 of the current struct.
378  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
379  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
380  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
381  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
; All four loads come first, followed by all four stores.
382  %b1 = load i32, i32* %a2
383  %b2 = load i32, i32* %a3
384  %b3 = load i32, i32* %a4
385  %b4 = load i32, i32* %a5
386  store i32 %b1, i32* %a7
387  store i32 %b2, i32* %a8
388  store i32 %b3, i32* %a9
389  store i32 %b4, i32* %a10
; Advance the destination and loop until %count iterations have run.
390  %c9 = add nsw i32 %i.02, 1
391  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
392  %exitcond = icmp eq i32 %c9, %count
393  br i1 %exitcond, label %._crit_edge, label %block4
394
395._crit_edge:                                      ; preds = %block4, %0
396  ret void
397}
398
399; On x86, even unaligned copies can be merged to vector ops. Every load and
; store below is marked align 1, and the expected assembly still merges the
; four i32 copies into one 16-byte vector load and one 16-byte vector store.
400define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
401; CHECK-LABEL: merge_loads_no_align:
402; CHECK:       # %bb.0:
403; CHECK-NEXT:    testl %edi, %edi
404; CHECK-NEXT:    jle .LBB8_2
405; CHECK-NEXT:    .p2align 4, 0x90
406; CHECK-NEXT:  .LBB8_1: # %block4
407; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
408; CHECK-NEXT:    vmovups (%rsi), %xmm0
409; CHECK-NEXT:    vmovups %xmm0, (%rdx)
410; CHECK-NEXT:    addq $32, %rdx
411; CHECK-NEXT:    decl %edi
412; CHECK-NEXT:    jne .LBB8_1
413; CHECK-NEXT:  .LBB8_2: # %._crit_edge
414; CHECK-NEXT:    retq
; Enter the loop only when %count > 0.
415  %a1 = icmp sgt i32 %count, 0
416  br i1 %a1, label %.lr.ph, label %._crit_edge
417
418.lr.ph:                                           ; preds = %0
; Source addresses (fields 0..3 of %q) are computed once, outside the loop.
419  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
420  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
421  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
422  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
423  br label %block4
424
425block4:                                       ; preds = %block4, %.lr.ph
; %i.02 is the iteration counter; %.01 is the destination struct.
426  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
427  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
; Destination addresses: fields 0..3 of the current struct.
428  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
429  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
430  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
431  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
; All four loads come first, followed by all four stores; each is align 1.
432  %b1 = load i32, i32* %a2, align 1
433  %b2 = load i32, i32* %a3, align 1
434  %b3 = load i32, i32* %a4, align 1
435  %b4 = load i32, i32* %a5, align 1
436  store i32 %b1, i32* %a7, align 1
437  store i32 %b2, i32* %a8, align 1
438  store i32 %b3, i32* %a9, align 1
439  store i32 %b4, i32* %a10, align 1
; Advance the destination and loop until %count iterations have run.
440  %c9 = add nsw i32 %i.02, 1
441  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
442  %exitcond = icmp eq i32 %c9, %count
443  br i1 %exitcond, label %._crit_edge, label %block4
444
445._crit_edge:                                      ; preds = %block4, %0
446  ret void
447}
448
449; Make sure that we merge the consecutive load/store sequence below and use a
450; word (16 bit) instead of a byte copy.
; Each iteration loads an i64 index from %a, copies the two bytes at %c[index]
; and %c[index + 1] to the next two bytes of %b, and repeats %n times.
451define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
452; BWON-LABEL: MergeLoadStoreBaseIndexOffset:
453; BWON:       # %bb.0:
454; BWON-NEXT:    movl %ecx, %r8d
455; BWON-NEXT:    xorl %ecx, %ecx
456; BWON-NEXT:    .p2align 4, 0x90
457; BWON-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
458; BWON-NEXT:    movq (%rdi,%rcx,8), %rax
459; BWON-NEXT:    movzwl (%rdx,%rax), %eax
460; BWON-NEXT:    movw %ax, (%rsi,%rcx,2)
461; BWON-NEXT:    incq %rcx
462; BWON-NEXT:    cmpl %ecx, %r8d
463; BWON-NEXT:    jne .LBB9_1
464; BWON-NEXT:  # %bb.2:
465; BWON-NEXT:    retq
466;
467; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
468; BWOFF:       # %bb.0:
469; BWOFF-NEXT:    movl %ecx, %r8d
470; BWOFF-NEXT:    xorl %ecx, %ecx
471; BWOFF-NEXT:    .p2align 4, 0x90
472; BWOFF-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
473; BWOFF-NEXT:    movq (%rdi,%rcx,8), %rax
474; BWOFF-NEXT:    movw (%rdx,%rax), %ax
475; BWOFF-NEXT:    movw %ax, (%rsi,%rcx,2)
476; BWOFF-NEXT:    incq %rcx
477; BWOFF-NEXT:    cmpl %ecx, %r8d
478; BWOFF-NEXT:    jne .LBB9_1
479; BWOFF-NEXT:  # %bb.2:
480; BWOFF-NEXT:    retq
481  br label %1
482
483; <label>:1
; %.09 counts down from %n; %.08 is the destination cursor; %.0 is the index
; cursor into %a.
484  %.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
485  %.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
486  %.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
487  %2 = getelementptr inbounds i64, i64* %.0, i64 1
488  %3 = load i64, i64* %.0, align 1
; Load the bytes at %c + %3 and %c + %3 + 1 (consecutive addresses).
489  %4 = getelementptr inbounds i8, i8* %c, i64 %3
490  %5 = load i8, i8* %4, align 1
491  %6 = add i64 %3, 1
492  %7 = getelementptr inbounds i8, i8* %c, i64 %6
493  %8 = load i8, i8* %7, align 1
; Store them to two consecutive destination bytes.
494  store i8 %5, i8* %.08, align 1
495  %9 = getelementptr inbounds i8, i8* %.08, i64 1
496  store i8 %8, i8* %9, align 1
; Advance the destination by two bytes and exit once the counter reaches zero.
497  %10 = getelementptr inbounds i8, i8* %.08, i64 2
498  %11 = add nsw i32 %.09, -1
499  %12 = icmp eq i32 %11, 0
500  br i1 %12, label %13, label %1
501
502; <label>:13
503  ret void
504}
505
506; Make sure that we merge the consecutive load/store sequence below and use a
507; word (16 bit) instead of a byte copy for complicated address calculation.
; Here the index is a sign-extended byte loaded from %b, and the second
; destination address is formed with "or 1" on the loop counter rather than an
; add.
508define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
509; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
510; BWON:       # %bb.0:
511; BWON-NEXT:    xorl %r8d, %r8d
512; BWON-NEXT:    .p2align 4, 0x90
513; BWON-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
514; BWON-NEXT:    movsbq (%rsi), %rax
515; BWON-NEXT:    movzwl (%rdx,%rax), %eax
516; BWON-NEXT:    movw %ax, (%rdi,%r8)
517; BWON-NEXT:    incq %rsi
518; BWON-NEXT:    addq $2, %r8
519; BWON-NEXT:    cmpq %rcx, %r8
520; BWON-NEXT:    jl .LBB10_1
521; BWON-NEXT:  # %bb.2:
522; BWON-NEXT:    retq
523;
524; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
525; BWOFF:       # %bb.0:
526; BWOFF-NEXT:    xorl %r8d, %r8d
527; BWOFF-NEXT:    .p2align 4, 0x90
528; BWOFF-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
529; BWOFF-NEXT:    movsbq (%rsi), %rax
530; BWOFF-NEXT:    movw (%rdx,%rax), %ax
531; BWOFF-NEXT:    movw %ax, (%rdi,%r8)
532; BWOFF-NEXT:    incq %rsi
533; BWOFF-NEXT:    addq $2, %r8
534; BWOFF-NEXT:    cmpq %rcx, %r8
535; BWOFF-NEXT:    jl .LBB10_1
536; BWOFF-NEXT:  # %bb.2:
537; BWOFF-NEXT:    retq
538  br label %1
539
540; <label>:1
; %.09 is the destination offset into %a (starts at 0, steps by 2); %.08 is the
; cursor over the index bytes in %b.
541  %.09 = phi i64 [ 0, %0 ], [ %13, %1 ]
542  %.08 = phi i8* [ %b, %0 ], [ %12, %1 ]
; Load one index byte and sign-extend it to i64 before any arithmetic.
543  %2 = load i8, i8* %.08, align 1
544  %3 = sext i8 %2 to i64
; Load the bytes at %c + %3 and %c + %3 + 1 (consecutive addresses).
545  %4 = getelementptr inbounds i8, i8* %c, i64 %3
546  %5 = load i8, i8* %4, align 1
547  %6 = add nsw i64 %3, 1
548  %7 = getelementptr inbounds i8, i8* %c, i64 %6
549  %8 = load i8, i8* %7, align 1
550  %9 = getelementptr inbounds i8, i8* %a, i64 %.09
551  store i8 %5, i8* %9, align 1
; %.09 starts at 0 and only ever grows by 2, so its low bit is clear and
; "or 1" yields %.09 + 1, the byte right after the first store.
552  %10 = or i64 %.09, 1
553  %11 = getelementptr inbounds i8, i8* %a, i64 %10
554  store i8 %8, i8* %11, align 1
; Advance the index cursor by one byte and the destination offset by two; keep
; looping while the offset is (signed) less than %n.
555  %12 = getelementptr inbounds i8, i8* %.08, i64 1
556  %13 = add nuw nsw i64 %.09, 2
557  %14 = icmp slt i64 %13, %n
558  br i1 %14, label %1, label %15
559
560; <label>:15
561  ret void
562}
563
564; Make sure that we merge the consecutive load/store sequence below and use a
565; word (16 bit) instead of a byte copy even if there are intermediate sign
566; extensions.
; The index byte is sign-extended to i64 first and the +1 is done in i64, so the
; two source addresses are consecutive.
567define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
568; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
569; BWON:       # %bb.0:
570; BWON-NEXT:    movl %ecx, %r8d
571; BWON-NEXT:    xorl %ecx, %ecx
572; BWON-NEXT:    .p2align 4, 0x90
573; BWON-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
574; BWON-NEXT:    movsbq (%rdi,%rcx), %rax
575; BWON-NEXT:    movzwl (%rdx,%rax), %eax
576; BWON-NEXT:    movw %ax, (%rsi,%rcx,2)
577; BWON-NEXT:    incq %rcx
578; BWON-NEXT:    cmpl %ecx, %r8d
579; BWON-NEXT:    jne .LBB11_1
580; BWON-NEXT:  # %bb.2:
581; BWON-NEXT:    retq
582;
583; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
584; BWOFF:       # %bb.0:
585; BWOFF-NEXT:    movl %ecx, %r8d
586; BWOFF-NEXT:    xorl %ecx, %ecx
587; BWOFF-NEXT:    .p2align 4, 0x90
588; BWOFF-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
589; BWOFF-NEXT:    movsbq (%rdi,%rcx), %rax
590; BWOFF-NEXT:    movw (%rdx,%rax), %ax
591; BWOFF-NEXT:    movw %ax, (%rsi,%rcx,2)
592; BWOFF-NEXT:    incq %rcx
593; BWOFF-NEXT:    cmpl %ecx, %r8d
594; BWOFF-NEXT:    jne .LBB11_1
595; BWOFF-NEXT:  # %bb.2:
596; BWOFF-NEXT:    retq
597  br label %1
598
599; <label>:1
; %.09 counts down from %n; %.08 is the destination cursor; %.0 is the cursor
; over the index bytes in %a.
600  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
601  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
602  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
603  %2 = getelementptr inbounds i8, i8* %.0, i64 1
604  %3 = load i8, i8* %.0, align 1
; Sign-extend the index to i64 before adding 1.
605  %4 = sext i8 %3 to i64
606  %5 = getelementptr inbounds i8, i8* %c, i64 %4
607  %6 = load i8, i8* %5, align 1
608  %7 = add i64 %4, 1
609  %8 = getelementptr inbounds i8, i8* %c, i64 %7
610  %9 = load i8, i8* %8, align 1
; Store the two loaded bytes to two consecutive destination bytes.
611  store i8 %6, i8* %.08, align 1
612  %10 = getelementptr inbounds i8, i8* %.08, i64 1
613  store i8 %9, i8* %10, align 1
; Advance the destination by two bytes and exit once the counter reaches zero.
614  %11 = getelementptr inbounds i8, i8* %.08, i64 2
615  %12 = add nsw i32 %.09, -1
616  %13 = icmp eq i32 %12, 0
617  br i1 %13, label %14, label %1
618
619; <label>:14
620  ret void
621}
622
623; However, we can only ignore sign extensions when merging if they are applied
624; to all of the address computations.
; Here the +1 is performed in i8 before the sign extension, so it can wrap and
; the two source addresses are not guaranteed to be consecutive. The expected
; assembly keeps two separate byte loads and two separate byte stores.
625define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
626; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
627; BWON:       # %bb.0:
628; BWON-NEXT:    movl %ecx, %r8d
629; BWON-NEXT:    xorl %ecx, %ecx
630; BWON-NEXT:    .p2align 4, 0x90
631; BWON-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
632; BWON-NEXT:    movsbq (%rdi,%rcx), %rax
633; BWON-NEXT:    movzbl (%rdx,%rax), %r9d
634; BWON-NEXT:    incb %al
635; BWON-NEXT:    movsbq %al, %rax
636; BWON-NEXT:    movzbl (%rdx,%rax), %eax
637; BWON-NEXT:    movb %r9b, (%rsi,%rcx,2)
638; BWON-NEXT:    movb %al, 1(%rsi,%rcx,2)
639; BWON-NEXT:    incq %rcx
640; BWON-NEXT:    cmpl %ecx, %r8d
641; BWON-NEXT:    jne .LBB12_1
642; BWON-NEXT:  # %bb.2:
643; BWON-NEXT:    retq
644;
645; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
646; BWOFF:       # %bb.0:
647; BWOFF-NEXT:    movl %ecx, %r8d
648; BWOFF-NEXT:    xorl %ecx, %ecx
649; BWOFF-NEXT:    .p2align 4, 0x90
650; BWOFF-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
651; BWOFF-NEXT:    movsbq (%rdi,%rcx), %rax
652; BWOFF-NEXT:    movb (%rdx,%rax), %r9b
653; BWOFF-NEXT:    incb %al
654; BWOFF-NEXT:    movsbq %al, %rax
655; BWOFF-NEXT:    movb (%rdx,%rax), %al
656; BWOFF-NEXT:    movb %r9b, (%rsi,%rcx,2)
657; BWOFF-NEXT:    movb %al, 1(%rsi,%rcx,2)
658; BWOFF-NEXT:    incq %rcx
659; BWOFF-NEXT:    cmpl %ecx, %r8d
660; BWOFF-NEXT:    jne .LBB12_1
661; BWOFF-NEXT:  # %bb.2:
662; BWOFF-NEXT:    retq
663  br label %1
664
665; <label>:1
; %.09 counts down from %n; %.08 is the destination cursor; %.0 is the cursor
; over the index bytes in %a.
666  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
667  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
668  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
669  %2 = getelementptr inbounds i8, i8* %.0, i64 1
670  %3 = load i8, i8* %.0, align 1
; First source address: sign-extended index.
671  %4 = sext i8 %3 to i64
672  %5 = getelementptr inbounds i8, i8* %c, i64 %4
673  %6 = load i8, i8* %5, align 1
; Second source address: add 1 in i8 (no nsw/nuw, so it may wrap), then
; sign-extend the possibly wrapped result.
674  %7 = add i8 %3, 1
675  %wrap.4 = sext i8 %7 to i64
676  %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
677  %9 = load i8, i8* %8, align 1
; Store the two loaded bytes to two consecutive destination bytes.
678  store i8 %6, i8* %.08, align 1
679  %10 = getelementptr inbounds i8, i8* %.08, i64 1
680  store i8 %9, i8* %10, align 1
; Advance the destination by two bytes and exit once the counter reaches zero.
681  %11 = getelementptr inbounds i8, i8* %.08, i64 2
682  %12 = add nsw i32 %.09, -1
683  %13 = icmp eq i32 %12, 0
684  br i1 %13, label %14, label %1
685
686; <label>:14
687  ret void
688}
689
690; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
; All eight elements of %v are extracted and stored to eight consecutive floats
; starting at %ptr. The expected assembly is a single 32-byte vector store of
; the whole register.
691define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
692; CHECK-LABEL: merge_vec_element_store:
693; CHECK:       # %bb.0:
694; CHECK-NEXT:    vmovups %ymm0, (%rdi)
695; CHECK-NEXT:    vzeroupper
696; CHECK-NEXT:    retq
; Extract elements 0..7 in order.
697  %vecext0 = extractelement <8 x float> %v, i32 0
698  %vecext1 = extractelement <8 x float> %v, i32 1
699  %vecext2 = extractelement <8 x float> %v, i32 2
700  %vecext3 = extractelement <8 x float> %v, i32 3
701  %vecext4 = extractelement <8 x float> %v, i32 4
702  %vecext5 = extractelement <8 x float> %v, i32 5
703  %vecext6 = extractelement <8 x float> %v, i32 6
704  %vecext7 = extractelement <8 x float> %v, i32 7
; Destination addresses %ptr[1..7]; element 0 goes to %ptr itself.
705  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
706  %arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
707  %arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
708  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
709  %arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
710  %arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
711  %arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
; Element i is stored to %ptr[i].
712  store float %vecext0, float* %ptr, align 4
713  store float %vecext1, float* %arrayidx1, align 4
714  store float %vecext2, float* %arrayidx2, align 4
715  store float %vecext3, float* %arrayidx3, align 4
716  store float %vecext4, float* %arrayidx4, align 4
717  store float %vecext5, float* %arrayidx5, align 4
718  store float %vecext6, float* %arrayidx6, align 4
719  store float %vecext7, float* %arrayidx7, align 4
720  ret void
721
722}
723
724; PR21711 - Merge vector stores into wider vector stores.
725; These should be merged into 32-byte stores.
; The low and high halves of %v1 go to %ptr[3] and %ptr[4], and the halves of
; %v2 go to %ptr[5] and %ptr[6]; the expected assembly stores each full 32-byte
; register once, at byte offsets 48 and 80.
726define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
727; CHECK-LABEL: merge_vec_extract_stores:
728; CHECK:       # %bb.0:
729; CHECK-NEXT:    vmovups %ymm0, 48(%rdi)
730; CHECK-NEXT:    vmovups %ymm1, 80(%rdi)
731; CHECK-NEXT:    vzeroupper
732; CHECK-NEXT:    retq
; Four consecutive <4 x float> slots, starting at index 3.
733  %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
734  %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
735  %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
736  %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
; Split each 8-element input into its low (0..3) and high (4..7) halves.
737  %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
738  %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
739  %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
740  %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
741  store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
742  store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
743  store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
744  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
745  ret void
746
747}
748
749; Merging vector stores when sourced from vector loads.
; Two adjacent 16-byte vectors are loaded from %v and stored to two adjacent
; slots at %ptr; the expected assembly uses one 32-byte load and one 32-byte
; store.
750define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
751; CHECK-LABEL: merge_vec_stores_from_loads:
752; CHECK:       # %bb.0:
753; CHECK-NEXT:    vmovups (%rdi), %ymm0
754; CHECK-NEXT:    vmovups %ymm0, (%rsi)
755; CHECK-NEXT:    vzeroupper
756; CHECK-NEXT:    retq
; Source slots %v[0] and %v[1]; the loads carry no explicit alignment.
757  %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
758  %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
759  %v0 = load <4 x float>, <4 x float>* %load_idx0
760  %v1 = load <4 x float>, <4 x float>* %load_idx1
; Destination slots %ptr[0] and %ptr[1].
761  %store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
762  %store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
763  store <4 x float> %v0, <4 x float>* %store_idx0, align 16
764  store <4 x float> %v1, <4 x float>* %store_idx1, align 16
765  ret void
766
767}
768
769; Merging vector stores when sourced from a constant vector is not currently handled.
; The expected assembly therefore keeps two separate 16-byte stores of a zeroed
; xmm register, at byte offsets 48 and 64.
770define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
771; CHECK-LABEL: merge_vec_stores_of_constants:
772; CHECK:       # %bb.0:
773; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
774; CHECK-NEXT:    vmovaps %xmm0, 48(%rdi)
775; CHECK-NEXT:    vmovaps %xmm0, 64(%rdi)
776; CHECK-NEXT:    retq
; Two adjacent <4 x i32> slots (%ptr[3] and %ptr[4]), both set to all zeros.
777  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
778  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
779  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
780  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
781  ret void
782
783}
784
785; This is a minimized test based on real code that was failing.
786; This should now be merged.
; array[4] receives a scalar load of array[0]; array[5] receives element 0 of a
; <2 x i64> loaded from &array[1]. The expected assembly is one 16-byte load
; from the start of the array and one 16-byte store at byte offset 32.
787define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
788; CHECK-LABEL: merge_vec_element_and_scalar_load:
789; CHECK:       # %bb.0:
790; CHECK-NEXT:    vmovups (%rdi), %xmm0
791; CHECK-NEXT:    vmovups %xmm0, 32(%rdi)
792; CHECK-NEXT:    retq
793  %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
794  %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
795  %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
796  %idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5
797
; Scalar copy: array[4] = array[0].
798  %a0 = load i64, i64* %idx0, align 8
799  store i64 %a0, i64* %idx4, align 8
800
; Vector load starting at array[1]; only its element 0 is stored, to array[5].
801  %b = bitcast i64* %idx1 to <2 x i64>*
802  %v = load <2 x i64>, <2 x i64>* %b, align 8
803  %a1 = extractelement <2 x i64> %v, i32 0
804  store i64 %a1, i64* %idx5, align 8
805  ret void
806
807}
808
809; Don't let a non-consecutive store thwart merging of the last two.
; Bytes are stored at offsets 0, 42, 2 and 3; the expected assembly keeps the
; first two as byte stores and merges offsets 2 and 3 into one 16-bit store of
; 0x0302.
810define void @almost_consecutive_stores(i8* %p) {
811; CHECK-LABEL: almost_consecutive_stores:
812; CHECK:       # %bb.0:
813; CHECK-NEXT:    movb $0, (%rdi)
814; CHECK-NEXT:    movb $1, 42(%rdi)
815; CHECK-NEXT:    movw $770, 2(%rdi) # imm = 0x302
816; CHECK-NEXT:    retq
817  store i8 0, i8* %p
; Offset 42 is not adjacent to any of the other stores.
818  %p1 = getelementptr i8, i8* %p, i64 42
819  store i8 1, i8* %p1
; Offsets 2 and 3 are adjacent to each other.
820  %p2 = getelementptr i8, i8* %p, i64 2
821  store i8 2, i8* %p2
822  %p3 = getelementptr i8, i8* %p, i64 3
823  store i8 3, i8* %p3
824  ret void
825}
826
827; We should be able to merge these.
; Element 0 is extracted from the float-typed bitcast of %v, while elements 1..3
; are extracted as i32 and bitcast to float individually. All four are stored
; to consecutive floats at %ptr, and the expected assembly is a single 16-byte
; vector store.
828define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
829; CHECK-LABEL: merge_bitcast:
830; CHECK:       # %bb.0:
831; CHECK-NEXT:    vmovups %xmm0, (%rdi)
832; CHECK-NEXT:    retq
833  %fv = bitcast <4 x i32> %v to <4 x float>
; Elements 1..3 come from the integer vector.
834  %vecext1 = extractelement <4 x i32> %v, i32 1
835  %vecext2 = extractelement <4 x i32> %v, i32 2
836  %vecext3 = extractelement <4 x i32> %v, i32 3
; Element 0 comes from the float view of the same vector.
837  %f0 = extractelement <4 x float> %fv, i32 0
838  %f1 = bitcast i32 %vecext1 to float
839  %f2 = bitcast i32 %vecext2 to float
840  %f3 = bitcast i32 %vecext3 to float
841  %idx0 = getelementptr inbounds float, float* %ptr, i64 0
842  %idx1 = getelementptr inbounds float, float* %ptr, i64 1
843  %idx2 = getelementptr inbounds float, float* %ptr, i64 2
844  %idx3 = getelementptr inbounds float, float* %ptr, i64 3
845  store float %f0, float* %idx0, align 4
846  store float %f1, float* %idx1, align 4
847  store float %f2, float* %idx2, align 4
848  store float %f3, float* %idx3, align 4
849  ret void
850}
851