; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted

; Two adjacent constant i8 stores with 2-byte alignment should merge
; into a single 16-bit store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out, align 2
  ret void
}

; With only natural (1-byte) alignment the two i8 stores must stay
; separate byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out
  ret void
}

; Two adjacent constant i16 stores with dword alignment merge into a
; single 32-bit store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; Same merge as above with both stored values zero.
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; With only natural (2-byte) alignment the two i16 stores must stay
; separate short stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; Two adjacent constant i32 stores merge into one dwordx2 store; the
; low register holds 456 (0x1c8) and the high register 123 (0x7b).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed-type (i32 + bitcast float) adjacent stores still merge.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed-type (float + bitcast i32) adjacent stores merge; 4.0 is an
; inline constant and 123 (0x7b) is a literal.
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; Four adjacent constant i32 stores merge into one dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Four adjacent constant float stores in ascending order merge into a
; single dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Mixed i32/float constant stores to four adjacent slots still merge
; into one dwordx4 store (requires AA to see through the bitcasts).
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Three adjacent constant i32 stores split into a dwordx2 plus a
; single dword store (no third dword store expected).
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Two adjacent constant i64 stores merge into one dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; Four adjacent constant i64 stores merge into two dwordx4 stores.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; Two adjacent loads feeding two adjacent stores merge into a single
; dwordx2 load/store pair.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Same as above but at a non-zero base; the merged accesses keep the
; 8-byte immediate offset.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; The stored elements are swapped relative to the loads; the accesses
; still merge.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; Four adjacent loads feeding four adjacent stores merge into a single
; dwordx4 load/store pair.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Three adjacent loads/stores merge into a dwordx2 plus a single dword
; access on each side.
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; Float variant of the four-adjacent-loads merge.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; Four-wide merge with different non-zero bases on the load (offset 44)
; and store (offset 28) sides.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Stores issued in reverse order, with an s_barrier between the loads
; and stores; merging must still happen across the barrier.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; Stored elements are fully reversed relative to the loads; the
; accesses still merge into dwordx4 forms.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; Four adjacent i8 loads/stores with dword-aligned base merge into a
; single dword load/store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; With only natural (1-byte) alignment the four i8 accesses must stay
; as individual byte loads/stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Scalar stores of the four extracted elements of a v4i32 load merge
; back into a single dwordx4 load/store pair.
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; LDS (addrspace 3) variant: two adjacent i8 stores with 2-byte
; alignment merge into one ds_write_b16.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b16
; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  store i8 456, i8 addrspace(3)* %out, align 2
  ret void
}

; LDS variant: two adjacent i32 stores become one ds_write2_b32.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; LDS variant: four adjacent i32 stores become two ds_write2_b32
; instructions.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; Five adjacent constant i32 stores split into a dwordx4 plus a single
; dword store.
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; Six adjacent constant i32 stores split into a dwordx4 plus a
; dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; Seven adjacent constant i32 stores split into dwordx4 + dwordx2 +
; dword stores.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; Eight adjacent constant i32 stores merge into two dwordx4 stores.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
  ret void
}

; v3i64 copy with 4-byte-aligned source must not use scratch memory.
; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
  ret void
}

; v3f32 load/fadd/store with 4-byte-aligned source must not use
; scratch memory.
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
  ret void
}

; v3f64 load/fadd/store with 4-byte-aligned source must not use
; scratch memory.
; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
  ret void
}

; Intrinsic declaration and attribute groups used by the kernels above.
declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }