; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
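; To regenerate these check lines after editing the IR below, re-run the update
; script (assumed typical invocation; the test path shown is illustrative):
;   llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/<this-test>.ll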
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s

define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT:    s_mov_b32 s6, 0x40000
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s6, 0x40000
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
; GFX1010-NEXT:    ;;#ASMSTART
; GFX1010-NEXT:    ;;#ASMEND
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT:    s_endpgm
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s6, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
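  ; The SGPR soffset is the per-lane offset scaled by the wavefront size:
  ; 4096 * 64 = 0x40000 for wave64; gfx1010 defaults to wave32, so the
  ; GFX1010 checks above use 4096 * 32 = 0x20000 instead.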
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

declare hidden void @ex() local_unnamed_addr #0

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "frame-pointer"="all" }