• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
4
5; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
6; GCN: s_waitcnt
7; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
8; GFX900-NEXT: s_waitcnt
9; GFX900-NEXT: s_setpc_b64
10
11; NO-D16-HI: ds_read_u16 v
; Loads one i16 through the addrspace(3) pointer %in and inserts it as
; element 1 of a <2 x i16> whose element 0 is undef; the vector is returned.
12define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
13entry:
14  %load = load i16, i16 addrspace(3)* %in
15  %build = insertelement <2 x i16> undef, i16 %load, i32 1
16  ret <2 x i16> %build
17}
18
19; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
20; GCN: s_waitcnt
21; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
22; GFX900-NEXT: s_waitcnt
23; GFX900-NEXT: v_mov_b32_e32 v0, v1
24; GFX900-NEXT: s_setpc_b64
25
26; NO-D16-HI: ds_read_u16 v
; Builds a <2 x i16> with the %reg argument in element 0 and an i16 loaded
; through the addrspace(3) pointer %in in element 1, then returns it.
27define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
28entry:
29  %load = load i16, i16 addrspace(3)* %in
30  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
31  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
32  ret <2 x i16> %build1
33}
34
35; Show that we get reasonable regalloc without physreg constraints.
36; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
37; GCN: s_waitcnt
38; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
39; GFX900-NEXT: s_waitcnt
40; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
41; GFX900-NEXT: s_waitcnt
42; GFX900-NEXT: s_setpc_b64
43
44; NO-D16-HI: ds_read_u16 v
; Same vector construction as the returning variant (%reg in element 0, the
; addrspace(3) i16 load in element 1), but the result is stored through an
; undef addrspace(1) pointer instead of being returned.
45define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
46entry:
47  %load = load i16, i16 addrspace(3)* %in
48  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
49  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
50  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
51  ret void
52}
53
54; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
55; GCN: s_waitcnt
56; GFX900-NEXT: v_mov_b32_e32 v1, 0
57; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
58; GFX900-NEXT: s_waitcnt
59; GFX900-NEXT: v_mov_b32_e32 v0, v1
60; GFX900-NEXT: s_setpc_b64
61
62; NO-D16-HI: ds_read_u16 v
; Inserts an i16 loaded through the addrspace(3) pointer %in into element 1
; of an all-zero <2 x i16>, so element 0 is a known zero; returns the vector.
63define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
64entry:
65  %load = load i16, i16 addrspace(3)* %in
66  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
67  ret <2 x i16> %build
68}
69
70; FIXME: Remove m0 initialization
71; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
72; GCN: s_waitcnt
73; GFX900-NEXT: ds_read_u16 v0, v0
74; GFX900-NEXT: s_waitcnt lgkmcnt(0)
75; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
76; GFX900-NEXT: s_setpc_b64
77
78; NO-D16-HI: ds_read_u16 v
79; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
; Scalar form of "loaded value in the upper 16 bits, zero in the lower 16":
; the i16 load is zero-extended to i32 and shifted left by 16. No vector
; types are involved; the i32 is returned.
80define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
81entry:
82  %load = load i16, i16 addrspace(3)* %in
83  %zext = zext i16 %load to i32
84  %shift = shl i32 %zext, 16
85  ret i32 %shift
86}
87
88; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
89; GCN: s_waitcnt
90; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
91; GFX900-NEXT: s_waitcnt
92; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
93; GFX900-NEXT: s_waitcnt
94; GFX900-NEXT: s_setpc_b64
95
96; NO-D16-HI: ds_read_u16 v
; Half-precision counterpart of the i16 test: %reg goes in element 0 and a
; half loaded through the addrspace(3) pointer %in goes in element 1 of a
; <2 x half>, which is stored through an undef addrspace(1) pointer.
97define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
98entry:
99  %load = load half, half addrspace(3)* %in
100  %build0 = insertelement <2 x half> undef, half %reg, i32 0
101  %build1 = insertelement <2 x half> %build0, half %load, i32 1
102  store <2 x half> %build1, <2 x half> addrspace(1)* undef
103  ret void
104}
105
106; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
107; GCN: s_waitcnt
108; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
109; GFX900-NEXT: s_waitcnt
110; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
111; GFX900-NEXT: s_waitcnt
112; GFX900-NEXT: s_setpc_b64
113
114; NO-D16-HI: ds_read_u8 v
; Loads an i8 through the addrspace(3) pointer %in, zero-extends it to i16,
; and places it in element 1 of a <2 x i16> whose element 0 is %reg. The
; vector is stored through an undef addrspace(1) pointer.
115define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
116entry:
117  %load = load i8, i8 addrspace(3)* %in
118  %ext = zext i8 %load to i16
119  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
120  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
121  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
122  ret void
123}
124
125; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
126; GCN: s_waitcnt
127; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
128; GFX900-NEXT: s_waitcnt
129; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
130; GFX900-NEXT: s_waitcnt
131; GFX900-NEXT: s_setpc_b64
132
133; NO-D16-HI: ds_read_i8 v
; Loads an i8 through the addrspace(3) pointer %in, sign-extends it to i16,
; and places it in element 1 of a <2 x i16> whose element 0 is %reg. The
; vector is stored through an undef addrspace(1) pointer.
134define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
135entry:
136  %load = load i8, i8 addrspace(3)* %in
137  %ext = sext i8 %load to i16
138  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
139  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
140  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
141  ret void
142}
143
144; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
145; GCN: s_waitcnt
146; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
147; GFX900-NEXT: s_waitcnt
148; GFX900-NEXT: global_store_dword
149; GFX900-NEXT: s_waitcnt
150; GFX900-NEXT: s_setpc_b64
; Loads an i16 from addrspace(1) at element index -2047 relative to %in
; (-2047 * 2 bytes = -4094, the immediate offset the checks above expect) and
; inserts it into element 1 of a <2 x i16> whose element 0 is %reg. The vector
; is stored through an undef addrspace(1) pointer.
151define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
152entry:
153  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
154  %load = load i16, i16 addrspace(1)* %gep
155  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
156  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
157  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
158  ret void
159}
160
161; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
162; GCN: s_waitcnt
163; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
164; GFX900-NEXT: s_waitcnt
165; GFX900-NEXT: global_store_dword
166; GFX900-NEXT: s_waitcnt
167; GFX900-NEXT: s_setpc_b64
; Half-precision variant: loads a half from addrspace(1) at element index
; -2047 relative to %in (-4094 bytes) and inserts it into element 1 of a
; <2 x half> whose element 0 is %reg. The vector is stored through an undef
; addrspace(1) pointer.
168define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
169entry:
170  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
171  %load = load half, half addrspace(1)* %gep
172  %build0 = insertelement <2 x half> undef, half %reg, i32 0
173  %build1 = insertelement <2 x half> %build0, half %load, i32 1
174  store <2 x half> %build1, <2 x half> addrspace(1)* undef
175  ret void
176}
177
178; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
179; GCN: s_waitcnt
180; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
181; GFX900-NEXT: s_waitcnt
182; GFX900-NEXT: global_store_dword
183; GFX900-NEXT: s_waitcnt
184; GFX900-NEXT: s_setpc_b64
; Loads an i8 from addrspace(1) at byte offset -4095 from %in, zero-extends
; it to i16, and inserts it into element 1 of a <2 x i16> whose element 0 is
; %reg. The vector is stored through an undef addrspace(1) pointer.
185define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
186entry:
187  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
188  %load = load i8, i8 addrspace(1)* %gep
189  %ext = zext i8 %load to i16
190  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
191  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
192  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
193  ret void
194}
195
196; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
197; GCN: s_waitcnt
198; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
199; GFX900-NEXT: s_waitcnt
200; GFX900-NEXT: global_store_dword
201; GFX900-NEXT: s_waitcnt
202; GFX900-NEXT: s_setpc_b64
; Loads an i8 from addrspace(1) at byte offset -4095 from %in, sign-extends
; it to i16, and inserts it into element 1 of a <2 x i16> whose element 0 is
; %reg. The vector is stored through an undef addrspace(1) pointer.
203define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
204entry:
205  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
206  %load = load i8, i8 addrspace(1)* %gep
207  %ext = sext i8 %load to i16
208  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
209  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
210  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
211  ret void
212}
213
; Loads an i16 through a pointer with no address-space qualifier and inserts
; it into element 1 of a <2 x i16> whose element 0 is %reg; the vector is
; stored through an undef addrspace(1) pointer. The label check is anchored
; to the start of the line, like the other anchored labels in this file.
214; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
215; GCN: s_waitcnt
216; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
217; GFX900-NEXT: s_waitcnt
218; GFX900-NEXT: global_store_dword v[0:1], v2
219; GFX900-NEXT: s_waitcnt
220; GFX900-NEXT: s_setpc_b64
221
222; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
223; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
224; GFX803: v_or_b32_sdwa
225; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
226define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
227entry:
228  %load = load i16, i16* %in
229  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
230  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
231  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
232  ret void
233}
234
235; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
236; GCN: s_waitcnt
237; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
238; GFX900-NEXT: s_waitcnt
239; GFX900-NEXT: global_store_dword v[0:1], v2
240; GFX900-NEXT: s_waitcnt
241; GFX900-NEXT: s_setpc_b64
242
243; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
244; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
245; GFX803: v_or_b32_sdwa
246; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
; Half-precision variant: loads a half through a pointer with no
; address-space qualifier and inserts it into element 1 of a <2 x half> whose
; element 0 is %reg. The vector is stored through an undef addrspace(1)
; pointer.
247define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
248entry:
249  %load = load half, half* %in
250  %build0 = insertelement <2 x half> undef, half %reg, i32 0
251  %build1 = insertelement <2 x half> %build0, half %load, i32 1
252  store <2 x half> %build1, <2 x half> addrspace(1)* undef
253  ret void
254}
255
256; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
257; GCN: s_waitcnt
258; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
259; GFX900-NEXT: s_waitcnt
260; GFX900-NEXT: global_store_dword v[0:1], v2
261; GFX900-NEXT: s_waitcnt
262; GFX900-NEXT: s_setpc_b64
263
264; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
265; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
266; GFX803: v_or_b32_sdwa
267; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
; Loads an i8 through a pointer with no address-space qualifier,
; zero-extends it to i16, and inserts it into element 1 of a <2 x i16> whose
; element 0 is %reg. The vector is stored through an undef addrspace(1)
; pointer.
268define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
269entry:
270  %load = load i8, i8* %in
271  %ext = zext i8 %load to i16
272  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
273  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
274  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
275  ret void
276}
277
278; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
279; GCN: s_waitcnt
280; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
281; GFX900-NEXT: s_waitcnt
282; GFX900-NEXT: global_store_dword v[0:1], v2
283; GFX900-NEXT: s_waitcnt
284; GFX900-NEXT: s_setpc_b64
285
286; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
287; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
288; GFX803: v_or_b32_sdwa
289; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
; Loads an i8 through a pointer with no address-space qualifier,
; sign-extends it to i16, and inserts it into element 1 of a <2 x i16> whose
; element 0 is %reg. The vector is stored through an undef addrspace(1)
; pointer.
290define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
291entry:
292  %load = load i8, i8* %in
293  %ext = sext i8 %load to i16
294  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
295  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
296  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
297  ret void
298}
299
300; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
301; GCN: s_waitcnt
302; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
303; GFX900-NEXT: s_waitcnt
304; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
305; GFX900-NEXT: s_waitcnt
306; GFX900-NEXT: s_setpc_b64
307
308; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
; Loads an i16 from element index 2045 of the byval addrspace(5) argument %in
; (byte offset 4090 from %in) and inserts it into element 1 of a <2 x i16>
; whose element 0 is %reg. The vector is stored through an undef addrspace(1)
; pointer.
; NOTE(review): the checks above expect offset:4094, so %in itself presumably
; sits 4 bytes above the base the offset is relative to - confirm.
309define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
310entry:
311  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
312  %load = load i16, i16 addrspace(5)* %gep
313  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
314  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
315  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
316  ret void
317}
318
319; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
320; GCN: s_waitcnt
321; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
322; GFX900-NEXT: s_waitcnt
323; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
324; GFX900-NEXT: s_waitcnt
325; GFX900-NEXT: s_setpc_b64
326
327; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
; Half-precision variant: loads a half from element index 2045 of the byval
; addrspace(5) argument %in (byte offset 4090 from %in) and inserts it into
; element 1 of a <2 x half> whose element 0 is %reg. The vector is stored
; through an undef addrspace(1) pointer.
; NOTE(review): the checks above expect offset:4094, so %in itself presumably
; sits 4 bytes above the base the offset is relative to - confirm.
328define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
329entry:
330  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
331  %load = load half, half addrspace(5)* %gep
332  %build0 = insertelement <2 x half> undef, half %reg, i32 0
333  %build1 = insertelement <2 x half> %build0, half %load, i32 1
334  store <2 x half> %build1, <2 x half> addrspace(1)* undef
335  ret void
336}
337
338; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
339; GCN: s_waitcnt
340; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}}
341; GFX900: s_waitcnt
342; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
343; GFX900-NEXT: s_waitcnt
344; GFX900-NEXT: s_setpc_b64
345
346; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
; Volatile i16 load from the constant addrspace(5) address 4094 (built with
; inttoptr); the byval %in argument is not referenced by the body. The loaded
; value becomes element 1 of a <2 x i16> whose element 0 is %reg, and the
; vector is stored through an undef addrspace(1) pointer.
347define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
348entry:
349  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
350  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
351  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
352  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
353  ret void
354}
355
356; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
357; GCN: s_waitcnt
358; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
359; GFX900-NEXT: s_waitcnt
360; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
361; GFX900-NEXT: s_waitcnt
362; GFX900-NEXT: s_setpc_b64
363
364; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
; Volatile half load from the constant addrspace(5) address 4094 (built with
; inttoptr); the %in argument is not referenced by the body. The loaded value
; becomes element 1 of a <2 x half> whose element 0 is %reg, and the vector is
; stored through an undef addrspace(1) pointer.
365define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
366entry:
367  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
368  %build0 = insertelement <2 x half> undef, half %reg, i32 0
369  %build1 = insertelement <2 x half> %build0, half %load, i32 1
370  store <2 x half> %build1, <2 x half> addrspace(1)* undef
371  ret void
372}
373
374; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
375; GCN: s_waitcnt
376; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
377; GFX900-NEXT: s_waitcnt
378; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
379; GFX900-NEXT: s_waitcnt
380; GFX900-NEXT: s_setpc_b64
381
382; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
; Loads an i8 at byte offset 4091 from the byval addrspace(5) argument %in,
; zero-extends it to i16, and inserts it into element 1 of a <2 x i16> whose
; element 0 is %reg. The vector is stored through an undef addrspace(1)
; pointer.
; NOTE(review): the checks above expect offset:4095, so %in itself presumably
; sits 4 bytes above the base the offset is relative to - confirm.
383define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
384entry:
385  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
386  %load = load i8, i8 addrspace(5)* %gep
387  %ext = zext i8 %load to i16
388  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
389  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
390  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
391  ret void
392}
393
394; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
395; GCN: s_waitcnt
396; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
397; GFX900-NEXT: s_waitcnt
398; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
399; GFX900-NEXT: s_waitcnt
400; GFX900-NEXT: s_setpc_b64
401
402; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
; Loads an i8 at byte offset 4091 from the byval addrspace(5) argument %in,
; sign-extends it to i16, and inserts it into element 1 of a <2 x i16> whose
; element 0 is %reg. The vector is stored through an undef addrspace(1)
; pointer.
; NOTE(review): the checks above expect offset:4095, so %in itself presumably
; sits 4 bytes above the base the offset is relative to - confirm.
403define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
404entry:
405  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
406  %load = load i8, i8 addrspace(5)* %gep
407  %ext = sext i8 %load to i16
408  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
409  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
410  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
411  ret void
412}
413
414; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
415; GCN: s_waitcnt
416; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
417; GFX900-NEXT: s_waitcnt
418; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
419; GFX900-NEXT: s_waitcnt
420; GFX900-NEXT: s_setpc_b64
421
422; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
; Volatile i8 load from the constant addrspace(5) address 4094 (built with
; inttoptr); %in is not referenced by the body. The byte is zero-extended to
; i16 and becomes element 1 of a <2 x i16> whose element 0 is %reg; the vector
; is stored through an undef addrspace(1) pointer.
423define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
424entry:
425  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
426  %ext = zext i8 %load to i16
427  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
428  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
429  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
430  ret void
431}
432
433; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
434; GCN: s_waitcnt
435; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
436; GFX900-NEXT: s_waitcnt
437; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
438; GFX900-NEXT: s_waitcnt
439; GFX900-NEXT: s_setpc_b64
440
441; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
; Volatile i8 load from the constant addrspace(5) address 4094 (built with
; inttoptr); %in is not referenced by the body. The byte is sign-extended to
; i16 and becomes element 1 of a <2 x i16> whose element 0 is %reg; the vector
; is stored through an undef addrspace(1) pointer.
442define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
443entry:
444  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
445  %ext = sext i8 %load to i16
446  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
447  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
448  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
449  ret void
450}
451
452; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
453; GCN: s_waitcnt
454; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
455; GFX900-NEXT: s_waitcnt
456; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
457; GFX900-NEXT: s_waitcnt
458; GFX900-NEXT: s_setpc_b64
459
460; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
; Volatile i8 load from the constant addrspace(5) address 4094 (built with
; inttoptr); %in is not referenced by the body. The byte is zero-extended to
; i16, bitcast to half, and inserted into element 1 of a <2 x half> whose
; element 0 is %reg; the vector is stored through an undef addrspace(1)
; pointer.
461define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
462entry:
463  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
464  %ext = zext i8 %load to i16
465  %bc.ext = bitcast i16 %ext to half
466  %build0 = insertelement <2 x half> undef, half %reg, i32 0
467  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
468  store <2 x half> %build1, <2 x half> addrspace(1)* undef
469  ret void
470}
471
472; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
473; GCN: s_waitcnt
474; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
475; GFX900-NEXT: s_waitcnt
476; GFX900-NEXT: global_store_dword
477; GFX900-NEXT: s_waitcnt
478; GFX900-NEXT: s_setpc_b64
479
480; GFX803: flat_load_ushort
481; GFX906: global_load_ushort
; Loads an i16 from addrspace(4) at element index -2047 relative to %in
; (-2047 * 2 bytes = -4094, the immediate offset the checks above expect) and
; inserts it into element 1 of a <2 x i16> whose element 0 is %reg. The vector
; is stored through an undef addrspace(1) pointer.
482define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
483entry:
484  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
485  %load = load i16, i16 addrspace(4)* %gep
486  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
487  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
488  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
489  ret void
490}
491
; Half-precision variant: loads a half from addrspace(4) at element index
; -2047 relative to %in (-4094 bytes) and inserts it into element 1 of a
; <2 x half> whose element 0 is %reg; the vector is stored through an undef
; addrspace(1) pointer. The label check is anchored to the start of the line
; and ends with the colon so that it matches only the function's own label.
492; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
493; GCN: s_waitcnt
494; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
495; GFX900-NEXT: s_waitcnt
496; GFX900-NEXT: global_store_dword
497; GFX900-NEXT: s_waitcnt
498; GFX900-NEXT: s_setpc_b64
499
500; GFX803: flat_load_ushort
501; GFX906: global_load_ushort
502define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
503entry:
504  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
505  %load = load half, half addrspace(4)* %gep
506  %build0 = insertelement <2 x half> undef, half %reg, i32 0
507  %build1 = insertelement <2 x half> %build0, half %load, i32 1
508  store <2 x half> %build1, <2 x half> addrspace(1)* undef
509  ret void
510}
511
512; Local object gives known offset, so requires converting from offen
513; to offset variant.
514
515; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
516; GFX900: buffer_store_dword
517; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
; Allocates two addrspace(5) objects: %obj0 (10 x i32, 40 bytes) and %obj1
; (4096 x i16). A volatile store of 123 goes to the start of %obj0, then an
; i16 is loaded from element 2025 of %obj1 (byte offset 4050 within %obj1)
; and inserted into element 1 of a <2 x i16> whose element 0 is %reg. The
; vector is stored through an undef addrspace(1) pointer.
; NOTE(review): the checks above expect offset:4094; this presumably reflects
; %obj1 being laid out after %obj0 plus a 4-byte base - confirm.
518define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
519entry:
520  %obj0 = alloca [10 x i32], align 4, addrspace(5)
521  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
522  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
523  store volatile i32 123, i32 addrspace(5)* %bc
524  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
525  %load = load i16, i16 addrspace(5)* %gep
526  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
527  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
528  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
529  ret void
530}
531
532; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
533; GFX900: buffer_store_dword
534; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
; Allocates two addrspace(5) objects: %obj0 (10 x i32, 40 bytes) and %obj1
; (4096 x i8). A volatile store of 123 goes to the start of %obj0, then an i8
; is loaded from byte 4051 of %obj1, sign-extended to i16, and inserted into
; element 1 of a <2 x i16> whose element 0 is %reg. The vector is stored
; through an undef addrspace(1) pointer.
; NOTE(review): the checks above expect offset:4095; this presumably reflects
; %obj1 being laid out after %obj0 plus a 4-byte base - confirm.
535define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
536entry:
537  %obj0 = alloca [10 x i32], align 4, addrspace(5)
538  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
539  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
540  store volatile i32 123, i32 addrspace(5)* %bc
541  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
542  %load = load i8, i8 addrspace(5)* %gep
543  %ext = sext i8 %load to i16
544  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
545  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
546  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
547  ret void
548}
549
550; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
551; GFX900: buffer_store_dword
552; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
; Allocates two addrspace(5) objects: %obj0 (10 x i32, 40 bytes) and %obj1
; (4096 x i8). A volatile store of 123 goes to the start of %obj0, then an i8
; is loaded from byte 4051 of %obj1, zero-extended to i16, and inserted into
; element 1 of a <2 x i16> whose element 0 is %reg. The vector is stored
; through an undef addrspace(1) pointer.
; NOTE(review): the checks above expect offset:4095; this presumably reflects
; %obj1 being laid out after %obj0 plus a 4-byte base - confirm.
553define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
554entry:
555  %obj0 = alloca [10 x i32], align 4, addrspace(5)
556  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
557  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
558  store volatile i32 123, i32 addrspace(5)* %bc
559  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
560  %load = load i8, i8 addrspace(5)* %gep
561  %ext = zext i8 %load to i16
562  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
563  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
564  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
565  ret void
566}
567
568; FIXME: Remove m0 init and waitcnt between reads
569; FIXME: Is there a cost to using the extload over not?
570; GCN-LABEL: {{^}}load_local_v2i16_split:
571; GCN: s_waitcnt
572; GFX900-NEXT: ds_read_u16 v1, v0
573; GFX900-NEXT: s_waitcnt
574; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
575; GFX900-NEXT: s_waitcnt
576; GFX900-NEXT: v_mov_b32_e32 v0, v1
577; GFX900-NEXT: s_setpc_b64
; Builds a <2 x i16> from two separate volatile i16 loads in addrspace(3):
; element 0 comes from %in and element 1 from the next i16 (%in + 1 element,
; i.e. 2 bytes further). The vector is returned.
578define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
579entry:
580  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
581  %load0 = load volatile i16, i16 addrspace(3)* %in
582  %load1 = load volatile i16, i16 addrspace(3)* %gep
583  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
584  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
585  ret <2 x i16> %build1
586}
587
588; FIXME: Remove waitcnt between reads
589; GCN-LABEL: {{^}}load_global_v2i16_split:
590; GCN: s_waitcnt
591; GFX900-NEXT: global_load_ushort v2
592; GFX900-NEXT: s_waitcnt
593; GFX900-NEXT: global_load_short_d16_hi v2
594; GFX900-NEXT: s_waitcnt
595; GFX900-NEXT: v_mov_b32_e32 v0, v2
596; GFX900-NEXT: s_setpc_b64
; Builds a <2 x i16> from two separate volatile i16 loads in addrspace(1):
; element 0 comes from %in and element 1 from the next i16 (%in + 1 element).
; The vector is returned.
597define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
598entry:
599  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
600  %load0 = load volatile i16, i16 addrspace(1)* %in
601  %load1 = load volatile i16, i16 addrspace(1)* %gep
602  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
603  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
604  ret <2 x i16> %build1
605}
606
607; FIXME: Remove waitcnt between reads
608; GCN-LABEL: {{^}}load_flat_v2i16_split:
609; GCN: s_waitcnt
610; GFX900-NEXT: flat_load_ushort v2
611; GFX900-NEXT: s_waitcnt
612; GFX900-NEXT: flat_load_short_d16_hi v2
613; GFX900-NEXT: s_waitcnt
614; GFX900-NEXT: v_mov_b32_e32 v0, v2
615; GFX900-NEXT: s_setpc_b64
; Builds a <2 x i16> from two separate volatile i16 loads through a pointer
; with no address-space qualifier: element 0 comes from %in and element 1
; from the next i16 (%in + 1 element). The vector is returned.
616define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
617entry:
618  %gep = getelementptr inbounds i16, i16* %in, i64 1
619  %load0 = load volatile i16, i16* %in
620  %load1 = load volatile i16, i16* %gep
621  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
622  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
623  ret <2 x i16> %build1
624}
625
626; FIXME: Remove waitcnt between reads
627; GCN-LABEL: {{^}}load_constant_v2i16_split:
628; GCN: s_waitcnt
629; GFX900-NEXT: global_load_ushort v2
630; GFX900-NEXT: s_waitcnt
631; GFX900-NEXT: global_load_short_d16_hi v2
632; GFX900-NEXT: s_waitcnt
633; GFX900-NEXT: v_mov_b32_e32 v0, v2
634; GFX900-NEXT: s_setpc_b64
; Builds a <2 x i16> from two separate volatile i16 loads in addrspace(4):
; element 0 comes from %in and element 1 from the next i16 (%in + 1 element).
; The vector is returned.
635define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
636entry:
637  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
638  %load0 = load volatile i16, i16 addrspace(4)* %in
639  %load1 = load volatile i16, i16 addrspace(4)* %gep
640  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
641  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
642  ret <2 x i16> %build1
643}
644
645; FIXME: Remove m0 init and waitcnt between reads
646; FIXME: Is there a cost to using the extload over not?
647; GCN-LABEL: {{^}}load_private_v2i16_split:
648; GCN: s_waitcnt
649; GFX900: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}}
650; GFX900-NEXT: s_waitcnt
651; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6
652; GFX900-NEXT: s_waitcnt
653; GFX900-NEXT: s_setpc_b64
; Builds a <2 x i16> from two separate volatile i16 loads from the byval
; addrspace(5) argument: element 0 comes from %in and element 1 from the next
; i16 (%in + 1 element, i.e. 2 bytes further). The vector is returned.
654define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
655entry:
656  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
657  %load0 = load volatile i16, i16 addrspace(5)* %in
658  %load1 = load volatile i16, i16 addrspace(5)* %gep
659  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
660  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
661  ret <2 x i16> %build1
662}
663
664attributes #0 = { nounwind }
665