• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6
; Test case: atomic i32 load from %in at syncscope("agent") with unordered
; ordering; the loaded value is then written to %out by an ordinary
; (non-atomic) store.
; Expected output, all four check prefixes: the load is a bare
; flat_load_dword with no glc/dlc modifier, and no cache-invalidate
; instruction appears anywhere in the kernel.
7define amdgpu_kernel void @flat_agent_unordered_load(
8; GFX7-LABEL: flat_agent_unordered_load:
9; GFX7:       ; %bb.0: ; %entry
10; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
11; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX7-NEXT:    v_mov_b32_e32 v0, s0
13; GFX7-NEXT:    v_mov_b32_e32 v1, s1
14; GFX7-NEXT:    flat_load_dword v0, v[0:1]
15; GFX7-NEXT:    v_mov_b32_e32 v2, s2
16; GFX7-NEXT:    v_mov_b32_e32 v3, s3
17; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
18; GFX7-NEXT:    flat_store_dword v[2:3], v0
19; GFX7-NEXT:    s_endpgm
20;
21; GFX10-WGP-LABEL: flat_agent_unordered_load:
22; GFX10-WGP:       ; %bb.0: ; %entry
23; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
24; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
26; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
27; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
28; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
29; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
30; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
31; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
32; GFX10-WGP-NEXT:    s_endpgm
33;
34; GFX10-CU-LABEL: flat_agent_unordered_load:
35; GFX10-CU:       ; %bb.0: ; %entry
36; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
37; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
38; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
39; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
40; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
41; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
42; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
43; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
44; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
45; GFX10-CU-NEXT:    s_endpgm
46;
47; SKIP-CACHE-INV-LABEL: flat_agent_unordered_load:
48; SKIP-CACHE-INV:       ; %bb.0: ; %entry
49; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
50; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
51; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
52; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
53; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
54; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
55; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
56; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
57; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
58; SKIP-CACHE-INV-NEXT:    s_endpgm
59    i32* %in, i32* %out) {
60entry:
61  %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
62  store i32 %val, i32* %out
63  ret void
64}
65
; Test case: atomic i32 load from %in at syncscope("agent") with monotonic
; ordering; the loaded value is then written to %out by an ordinary
; (non-atomic) store.
; Expected output: the two gfx700 runs (GFX7 and SKIP-CACHE-INV prefixes)
; check for flat_load_dword with the glc modifier; the two gfx1010 runs
; (GFX10-WGP and GFX10-CU prefixes) check for glc dlc. No cache-invalidate
; instruction is expected under any prefix.
66define amdgpu_kernel void @flat_agent_monotonic_load(
67; GFX7-LABEL: flat_agent_monotonic_load:
68; GFX7:       ; %bb.0: ; %entry
69; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
70; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX7-NEXT:    v_mov_b32_e32 v0, s0
72; GFX7-NEXT:    v_mov_b32_e32 v1, s1
73; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
74; GFX7-NEXT:    v_mov_b32_e32 v2, s2
75; GFX7-NEXT:    v_mov_b32_e32 v3, s3
76; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
77; GFX7-NEXT:    flat_store_dword v[2:3], v0
78; GFX7-NEXT:    s_endpgm
79;
80; GFX10-WGP-LABEL: flat_agent_monotonic_load:
81; GFX10-WGP:       ; %bb.0: ; %entry
82; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
83; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
85; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
86; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
87; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
88; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
89; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
90; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
91; GFX10-WGP-NEXT:    s_endpgm
92;
93; GFX10-CU-LABEL: flat_agent_monotonic_load:
94; GFX10-CU:       ; %bb.0: ; %entry
95; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
96; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
98; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
99; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
100; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
101; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
102; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
103; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
104; GFX10-CU-NEXT:    s_endpgm
105;
106; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_load:
107; SKIP-CACHE-INV:       ; %bb.0: ; %entry
108; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
109; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
110; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
111; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
112; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
113; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
114; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
115; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
116; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
117; SKIP-CACHE-INV-NEXT:    s_endpgm
118    i32* %in, i32* %out) {
119entry:
120  %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
121  store i32 %val, i32* %out
122  ret void
123}
124
; Test case: atomic i32 load from %in at syncscope("agent") with acquire
; ordering; the loaded value is then written to %out by an ordinary
; (non-atomic) store.
; Expected output after the glc (gfx700) / glc dlc (gfx1010) load:
;   - GFX7 prefix: s_waitcnt vmcnt(0) lgkmcnt(0), then buffer_wbinvl1_vol.
;   - GFX10-WGP and GFX10-CU prefixes: the same wait, then buffer_gl0_inv
;     and buffer_gl1_inv.
;   - SKIP-CACHE-INV prefix (run with -amdgcn-skip-cache-invalidations):
;     the wait is still checked, but no invalidate instruction follows it.
125define amdgpu_kernel void @flat_agent_acquire_load(
126; GFX7-LABEL: flat_agent_acquire_load:
127; GFX7:       ; %bb.0: ; %entry
128; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
129; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX7-NEXT:    v_mov_b32_e32 v0, s0
131; GFX7-NEXT:    v_mov_b32_e32 v1, s1
132; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
133; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
134; GFX7-NEXT:    buffer_wbinvl1_vol
135; GFX7-NEXT:    v_mov_b32_e32 v2, s2
136; GFX7-NEXT:    v_mov_b32_e32 v3, s3
137; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
138; GFX7-NEXT:    flat_store_dword v[2:3], v0
139; GFX7-NEXT:    s_endpgm
140;
141; GFX10-WGP-LABEL: flat_agent_acquire_load:
142; GFX10-WGP:       ; %bb.0: ; %entry
143; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
144; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
146; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
147; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
148; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
149; GFX10-WGP-NEXT:    buffer_gl0_inv
150; GFX10-WGP-NEXT:    buffer_gl1_inv
151; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
152; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
153; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
155; GFX10-WGP-NEXT:    s_endpgm
156;
157; GFX10-CU-LABEL: flat_agent_acquire_load:
158; GFX10-CU:       ; %bb.0: ; %entry
159; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
160; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
161; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
162; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
163; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
164; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
165; GFX10-CU-NEXT:    buffer_gl0_inv
166; GFX10-CU-NEXT:    buffer_gl1_inv
167; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
168; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
169; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
171; GFX10-CU-NEXT:    s_endpgm
172;
173; SKIP-CACHE-INV-LABEL: flat_agent_acquire_load:
174; SKIP-CACHE-INV:       ; %bb.0: ; %entry
175; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
176; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
177; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
178; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
179; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
180; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
181; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
182; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
183; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
184; SKIP-CACHE-INV-NEXT:    s_endpgm
185    i32* %in, i32* %out) {
186entry:
187  %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
188  store i32 %val, i32* %out
189  ret void
190}
191
; Test case: atomic i32 load from %in at syncscope("agent") with seq_cst
; ordering; the loaded value is then written to %out by an ordinary
; (non-atomic) store.
; Expected output:
;   - Before the load, every prefix checks s_waitcnt vmcnt(0) lgkmcnt(0);
;     the two GFX10 prefixes additionally check s_waitcnt_vscnt null, 0x0.
;   - The load carries glc (gfx700 runs) or glc dlc (gfx1010 runs).
;   - After the load, every prefix checks s_waitcnt vmcnt(0) lgkmcnt(0),
;     followed by buffer_wbinvl1_vol (GFX7 prefix) or buffer_gl0_inv +
;     buffer_gl1_inv (GFX10 prefixes). Under the SKIP-CACHE-INV prefix no
;     invalidate instruction is checked.
192define amdgpu_kernel void @flat_agent_seq_cst_load(
193; GFX7-LABEL: flat_agent_seq_cst_load:
194; GFX7:       ; %bb.0: ; %entry
195; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
196; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
197; GFX7-NEXT:    v_mov_b32_e32 v0, s0
198; GFX7-NEXT:    v_mov_b32_e32 v1, s1
199; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
200; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
201; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
202; GFX7-NEXT:    buffer_wbinvl1_vol
203; GFX7-NEXT:    v_mov_b32_e32 v2, s2
204; GFX7-NEXT:    v_mov_b32_e32 v3, s3
205; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX7-NEXT:    flat_store_dword v[2:3], v0
207; GFX7-NEXT:    s_endpgm
208;
209; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
210; GFX10-WGP:       ; %bb.0: ; %entry
211; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
212; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
214; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
215; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
216; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
217; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
218; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
219; GFX10-WGP-NEXT:    buffer_gl0_inv
220; GFX10-WGP-NEXT:    buffer_gl1_inv
221; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
222; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
223; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
225; GFX10-WGP-NEXT:    s_endpgm
226;
227; GFX10-CU-LABEL: flat_agent_seq_cst_load:
228; GFX10-CU:       ; %bb.0: ; %entry
229; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
230; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
232; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
233; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
234; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
235; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
236; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
237; GFX10-CU-NEXT:    buffer_gl0_inv
238; GFX10-CU-NEXT:    buffer_gl1_inv
239; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
240; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
241; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
243; GFX10-CU-NEXT:    s_endpgm
244;
245; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_load:
246; SKIP-CACHE-INV:       ; %bb.0: ; %entry
247; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
248; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
251; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
252; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
253; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
254; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
255; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
256; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
257; SKIP-CACHE-INV-NEXT:    s_endpgm
258    i32* %in, i32* %out) {
259entry:
260  %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
261  store i32 %val, i32* %out
262  ret void
263}
264
; Test case: atomic store of the i32 argument %in to %out at
; syncscope("agent") with unordered ordering.
; Expected output, all four check prefixes: a bare flat_store_dword. The only
; wait checked is the s_waitcnt lgkmcnt(0) that follows the s_load
; instructions; no additional wait is checked immediately before the store.
265define amdgpu_kernel void @flat_agent_unordered_store(
266; GFX7-LABEL: flat_agent_unordered_store:
267; GFX7:       ; %bb.0: ; %entry
268; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
269; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
270; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
271; GFX7-NEXT:    v_mov_b32_e32 v2, s2
272; GFX7-NEXT:    v_mov_b32_e32 v0, s0
273; GFX7-NEXT:    v_mov_b32_e32 v1, s1
274; GFX7-NEXT:    flat_store_dword v[0:1], v2
275; GFX7-NEXT:    s_endpgm
276;
277; GFX10-WGP-LABEL: flat_agent_unordered_store:
278; GFX10-WGP:       ; %bb.0: ; %entry
279; GFX10-WGP-NEXT:    s_clause 0x1
280; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
281; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
282; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
284; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
285; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
286; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
287; GFX10-WGP-NEXT:    s_endpgm
288;
289; GFX10-CU-LABEL: flat_agent_unordered_store:
290; GFX10-CU:       ; %bb.0: ; %entry
291; GFX10-CU-NEXT:    s_clause 0x1
292; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
293; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
294; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
296; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
297; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
298; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
299; GFX10-CU-NEXT:    s_endpgm
300;
301; SKIP-CACHE-INV-LABEL: flat_agent_unordered_store:
302; SKIP-CACHE-INV:       ; %bb.0: ; %entry
303; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
304; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
305; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
306; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
308; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
309; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
310; SKIP-CACHE-INV-NEXT:    s_endpgm
311    i32 %in, i32* %out) {
312entry:
313  store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
314  ret void
315}
316
; Test case: atomic store of the i32 argument %in to %out at
; syncscope("agent") with monotonic ordering.
; Expected output, all four check prefixes: a bare flat_store_dword. The only
; wait checked is the s_waitcnt lgkmcnt(0) that follows the s_load
; instructions; no additional wait is checked immediately before the store.
317define amdgpu_kernel void @flat_agent_monotonic_store(
318; GFX7-LABEL: flat_agent_monotonic_store:
319; GFX7:       ; %bb.0: ; %entry
320; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
321; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
322; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX7-NEXT:    v_mov_b32_e32 v2, s2
324; GFX7-NEXT:    v_mov_b32_e32 v0, s0
325; GFX7-NEXT:    v_mov_b32_e32 v1, s1
326; GFX7-NEXT:    flat_store_dword v[0:1], v2
327; GFX7-NEXT:    s_endpgm
328;
329; GFX10-WGP-LABEL: flat_agent_monotonic_store:
330; GFX10-WGP:       ; %bb.0: ; %entry
331; GFX10-WGP-NEXT:    s_clause 0x1
332; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
333; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
334; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
335; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
336; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
337; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
338; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
339; GFX10-WGP-NEXT:    s_endpgm
340;
341; GFX10-CU-LABEL: flat_agent_monotonic_store:
342; GFX10-CU:       ; %bb.0: ; %entry
343; GFX10-CU-NEXT:    s_clause 0x1
344; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
345; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
346; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
348; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
349; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
350; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
351; GFX10-CU-NEXT:    s_endpgm
352;
353; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_store:
354; SKIP-CACHE-INV:       ; %bb.0: ; %entry
355; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
356; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
357; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
358; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
359; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
360; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
361; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
362; SKIP-CACHE-INV-NEXT:    s_endpgm
363    i32 %in, i32* %out) {
364entry:
365  store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
366  ret void
367}
368
; Test case: atomic store of the i32 argument %in to %out at
; syncscope("agent") with release ordering.
; Expected output: every prefix checks s_waitcnt vmcnt(0) lgkmcnt(0)
; immediately ahead of flat_store_dword; the two GFX10 prefixes additionally
; check s_waitcnt_vscnt null, 0x0 between that wait and the store. Nothing
; is checked between the store and s_endpgm.
369define amdgpu_kernel void @flat_agent_release_store(
370; GFX7-LABEL: flat_agent_release_store:
371; GFX7:       ; %bb.0: ; %entry
372; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
373; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
374; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX7-NEXT:    v_mov_b32_e32 v2, s2
376; GFX7-NEXT:    v_mov_b32_e32 v0, s0
377; GFX7-NEXT:    v_mov_b32_e32 v1, s1
378; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
379; GFX7-NEXT:    flat_store_dword v[0:1], v2
380; GFX7-NEXT:    s_endpgm
381;
382; GFX10-WGP-LABEL: flat_agent_release_store:
383; GFX10-WGP:       ; %bb.0: ; %entry
384; GFX10-WGP-NEXT:    s_clause 0x1
385; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
386; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
387; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
389; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
390; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
391; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
392; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
393; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
394; GFX10-WGP-NEXT:    s_endpgm
395;
396; GFX10-CU-LABEL: flat_agent_release_store:
397; GFX10-CU:       ; %bb.0: ; %entry
398; GFX10-CU-NEXT:    s_clause 0x1
399; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
400; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
401; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
403; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
404; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
405; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
406; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
407; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
408; GFX10-CU-NEXT:    s_endpgm
409;
410; SKIP-CACHE-INV-LABEL: flat_agent_release_store:
411; SKIP-CACHE-INV:       ; %bb.0: ; %entry
412; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
413; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
414; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
415; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
416; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
417; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
418; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
419; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
420; SKIP-CACHE-INV-NEXT:    s_endpgm
421    i32 %in, i32* %out) {
422entry:
423  store atomic i32 %in, i32* %out syncscope("agent") release, align 4
424  ret void
425}
426
; Test case: atomic store of the i32 argument %in to %out at
; syncscope("agent") with seq_cst ordering.
; Expected output: every prefix checks s_waitcnt vmcnt(0) lgkmcnt(0)
; immediately ahead of flat_store_dword; the two GFX10 prefixes additionally
; check s_waitcnt_vscnt null, 0x0 between that wait and the store. Nothing
; is checked between the store and s_endpgm.
427define amdgpu_kernel void @flat_agent_seq_cst_store(
428; GFX7-LABEL: flat_agent_seq_cst_store:
429; GFX7:       ; %bb.0: ; %entry
430; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
431; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
432; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX7-NEXT:    v_mov_b32_e32 v2, s2
434; GFX7-NEXT:    v_mov_b32_e32 v0, s0
435; GFX7-NEXT:    v_mov_b32_e32 v1, s1
436; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
437; GFX7-NEXT:    flat_store_dword v[0:1], v2
438; GFX7-NEXT:    s_endpgm
439;
440; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
441; GFX10-WGP:       ; %bb.0: ; %entry
442; GFX10-WGP-NEXT:    s_clause 0x1
443; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
444; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
445; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
447; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
448; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
449; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
450; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
451; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
452; GFX10-WGP-NEXT:    s_endpgm
453;
454; GFX10-CU-LABEL: flat_agent_seq_cst_store:
455; GFX10-CU:       ; %bb.0: ; %entry
456; GFX10-CU-NEXT:    s_clause 0x1
457; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
458; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
459; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
461; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
462; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
463; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
464; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
465; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
466; GFX10-CU-NEXT:    s_endpgm
467;
468; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_store:
469; SKIP-CACHE-INV:       ; %bb.0: ; %entry
470; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
471; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
472; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
473; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
474; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
475; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
476; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
477; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
478; SKIP-CACHE-INV-NEXT:    s_endpgm
479    i32 %in, i32* %out) {
480entry:
481  store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
482  ret void
483}
484
; Test case: volatile atomicrmw xchg of %in into the i32 at %out at
; syncscope("agent") with monotonic ordering. The result %val is not used.
; Expected output, all four check prefixes: flat_atomic_swap written without
; a destination register, followed directly by s_endpgm. No wait is checked
; immediately around the swap and no cache-invalidate instruction is checked.
485define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
486; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
487; GFX7:       ; %bb.0: ; %entry
488; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
489; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
490; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX7-NEXT:    v_mov_b32_e32 v0, s0
492; GFX7-NEXT:    v_mov_b32_e32 v1, s1
493; GFX7-NEXT:    v_mov_b32_e32 v2, s2
494; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
495; GFX7-NEXT:    s_endpgm
496;
497; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
498; GFX10-WGP:       ; %bb.0: ; %entry
499; GFX10-WGP-NEXT:    s_clause 0x1
500; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
501; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
502; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
503; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
504; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
505; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
506; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
507; GFX10-WGP-NEXT:    s_endpgm
508;
509; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
510; GFX10-CU:       ; %bb.0: ; %entry
511; GFX10-CU-NEXT:    s_clause 0x1
512; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
513; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
514; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
516; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
517; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
518; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
519; GFX10-CU-NEXT:    s_endpgm
520;
521; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_atomicrmw:
522; SKIP-CACHE-INV:       ; %bb.0: ; %entry
523; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
524; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
525; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
526; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
527; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
529; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
530; SKIP-CACHE-INV-NEXT:    s_endpgm
531    i32* %out, i32 %in) {
532entry:
533  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic
534  ret void
535}
536
; Test case: volatile atomicrmw xchg of %in into the i32 at %out at
; syncscope("agent") with acquire ordering. The result %val is not used.
; Expected output after flat_atomic_swap (no wait is checked just before it):
;   - GFX7 prefix: s_waitcnt vmcnt(0) lgkmcnt(0), then buffer_wbinvl1_vol.
;   - GFX10-WGP and GFX10-CU prefixes: s_waitcnt lgkmcnt(0) and
;     s_waitcnt_vscnt null, 0x0, then buffer_gl0_inv and buffer_gl1_inv.
;   - SKIP-CACHE-INV prefix (run with -amdgcn-skip-cache-invalidations):
;     only s_waitcnt vmcnt(0) lgkmcnt(0); no invalidate instruction.
537define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
538; GFX7-LABEL: flat_agent_acquire_atomicrmw:
539; GFX7:       ; %bb.0: ; %entry
540; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
541; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
542; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX7-NEXT:    v_mov_b32_e32 v0, s0
544; GFX7-NEXT:    v_mov_b32_e32 v1, s1
545; GFX7-NEXT:    v_mov_b32_e32 v2, s2
546; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
547; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
548; GFX7-NEXT:    buffer_wbinvl1_vol
549; GFX7-NEXT:    s_endpgm
550;
551; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
552; GFX10-WGP:       ; %bb.0: ; %entry
553; GFX10-WGP-NEXT:    s_clause 0x1
554; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
555; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
556; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
558; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
559; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
560; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
561; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
563; GFX10-WGP-NEXT:    buffer_gl0_inv
564; GFX10-WGP-NEXT:    buffer_gl1_inv
565; GFX10-WGP-NEXT:    s_endpgm
566;
567; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
568; GFX10-CU:       ; %bb.0: ; %entry
569; GFX10-CU-NEXT:    s_clause 0x1
570; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
571; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
572; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
574; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
575; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
576; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
577; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
579; GFX10-CU-NEXT:    buffer_gl0_inv
580; GFX10-CU-NEXT:    buffer_gl1_inv
581; GFX10-CU-NEXT:    s_endpgm
582;
583; SKIP-CACHE-INV-LABEL: flat_agent_acquire_atomicrmw:
584; SKIP-CACHE-INV:       ; %bb.0: ; %entry
585; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
586; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
587; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
588; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
589; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
590; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
591; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
592; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
593; SKIP-CACHE-INV-NEXT:    s_endpgm
594    i32* %out, i32 %in) {
595entry:
596  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
597  ret void
598}
599
; Test case: volatile atomicrmw xchg of %in into the i32 at %out at
; syncscope("agent") with release ordering. The result %val is not used.
; Expected output: every prefix checks s_waitcnt vmcnt(0) lgkmcnt(0)
; immediately ahead of flat_atomic_swap; the two GFX10 prefixes additionally
; check s_waitcnt_vscnt null, 0x0 between that wait and the swap. The swap is
; followed directly by s_endpgm (no trailing wait or invalidate is checked).
600define amdgpu_kernel void @flat_agent_release_atomicrmw(
601; GFX7-LABEL: flat_agent_release_atomicrmw:
602; GFX7:       ; %bb.0: ; %entry
603; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
604; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
605; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
606; GFX7-NEXT:    v_mov_b32_e32 v0, s0
607; GFX7-NEXT:    v_mov_b32_e32 v1, s1
608; GFX7-NEXT:    v_mov_b32_e32 v2, s2
609; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
610; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
611; GFX7-NEXT:    s_endpgm
612;
613; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
614; GFX10-WGP:       ; %bb.0: ; %entry
615; GFX10-WGP-NEXT:    s_clause 0x1
616; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
617; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
618; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
620; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
621; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
622; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
623; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
624; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
625; GFX10-WGP-NEXT:    s_endpgm
626;
627; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
628; GFX10-CU:       ; %bb.0: ; %entry
629; GFX10-CU-NEXT:    s_clause 0x1
630; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
631; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
632; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
634; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
635; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
636; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
637; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
638; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
639; GFX10-CU-NEXT:    s_endpgm
640;
641; SKIP-CACHE-INV-LABEL: flat_agent_release_atomicrmw:
642; SKIP-CACHE-INV:       ; %bb.0: ; %entry
643; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
644; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
645; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
646; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
647; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
648; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
649; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
650; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
651; SKIP-CACHE-INV-NEXT:    s_endpgm
652    i32* %out, i32 %in) {
653entry:
654  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release
655  ret void
656}
657
; Test case: volatile atomicrmw xchg of %in into the i32 at %out at
; syncscope("agent") with acq_rel ordering. The result %val is not used.
; Expected output:
;   - Before flat_atomic_swap, every prefix checks
;     s_waitcnt vmcnt(0) lgkmcnt(0); the two GFX10 prefixes additionally
;     check s_waitcnt_vscnt null, 0x0.
;   - After the swap: GFX7 prefix checks s_waitcnt vmcnt(0) lgkmcnt(0) then
;     buffer_wbinvl1_vol; GFX10 prefixes check s_waitcnt lgkmcnt(0) and
;     s_waitcnt_vscnt null, 0x0 then buffer_gl0_inv and buffer_gl1_inv;
;     SKIP-CACHE-INV prefix checks only s_waitcnt vmcnt(0) lgkmcnt(0).
658define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
659; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
660; GFX7:       ; %bb.0: ; %entry
661; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
662; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
663; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX7-NEXT:    v_mov_b32_e32 v0, s0
665; GFX7-NEXT:    v_mov_b32_e32 v1, s1
666; GFX7-NEXT:    v_mov_b32_e32 v2, s2
667; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
668; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
669; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
670; GFX7-NEXT:    buffer_wbinvl1_vol
671; GFX7-NEXT:    s_endpgm
672;
673; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
674; GFX10-WGP:       ; %bb.0: ; %entry
675; GFX10-WGP-NEXT:    s_clause 0x1
676; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
677; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
678; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
679; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
680; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
681; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
682; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
683; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
684; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
685; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
686; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
687; GFX10-WGP-NEXT:    buffer_gl0_inv
688; GFX10-WGP-NEXT:    buffer_gl1_inv
689; GFX10-WGP-NEXT:    s_endpgm
690;
691; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
692; GFX10-CU:       ; %bb.0: ; %entry
693; GFX10-CU-NEXT:    s_clause 0x1
694; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
695; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
696; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
697; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
698; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
699; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
700; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
701; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
702; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
703; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
705; GFX10-CU-NEXT:    buffer_gl0_inv
706; GFX10-CU-NEXT:    buffer_gl1_inv
707; GFX10-CU-NEXT:    s_endpgm
708;
709; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_atomicrmw:
710; SKIP-CACHE-INV:       ; %bb.0: ; %entry
711; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
712; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
713; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
714; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
715; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
716; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
717; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
718; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
719; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
720; SKIP-CACHE-INV-NEXT:    s_endpgm
721    i32* %out, i32 %in) {
722entry:
723  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
724  ret void
725}
726
; Test case: volatile atomicrmw xchg of %in into the i32 at %out at
; syncscope("agent") with seq_cst ordering. The result %val is not used.
; Expected output:
;   - Before flat_atomic_swap, every prefix checks
;     s_waitcnt vmcnt(0) lgkmcnt(0); the two GFX10 prefixes additionally
;     check s_waitcnt_vscnt null, 0x0.
;   - After the swap: GFX7 prefix checks s_waitcnt vmcnt(0) lgkmcnt(0) then
;     buffer_wbinvl1_vol; GFX10 prefixes check s_waitcnt lgkmcnt(0) and
;     s_waitcnt_vscnt null, 0x0 then buffer_gl0_inv and buffer_gl1_inv;
;     SKIP-CACHE-INV prefix checks only s_waitcnt vmcnt(0) lgkmcnt(0).
727define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
728; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
729; GFX7:       ; %bb.0: ; %entry
730; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
731; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
732; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX7-NEXT:    v_mov_b32_e32 v0, s0
734; GFX7-NEXT:    v_mov_b32_e32 v1, s1
735; GFX7-NEXT:    v_mov_b32_e32 v2, s2
736; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
737; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
738; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
739; GFX7-NEXT:    buffer_wbinvl1_vol
740; GFX7-NEXT:    s_endpgm
741;
742; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
743; GFX10-WGP:       ; %bb.0: ; %entry
744; GFX10-WGP-NEXT:    s_clause 0x1
745; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
746; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
747; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
749; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
750; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
751; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
752; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
753; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
754; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
756; GFX10-WGP-NEXT:    buffer_gl0_inv
757; GFX10-WGP-NEXT:    buffer_gl1_inv
758; GFX10-WGP-NEXT:    s_endpgm
759;
760; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
761; GFX10-CU:       ; %bb.0: ; %entry
762; GFX10-CU-NEXT:    s_clause 0x1
763; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
764; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
765; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
767; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
768; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
769; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
770; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
771; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
772; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
774; GFX10-CU-NEXT:    buffer_gl0_inv
775; GFX10-CU-NEXT:    buffer_gl1_inv
776; GFX10-CU-NEXT:    s_endpgm
777;
778; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_atomicrmw:
779; SKIP-CACHE-INV:       ; %bb.0: ; %entry
780; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
781; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
782; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
783; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
784; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
785; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
786; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
787; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
788; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
789; SKIP-CACHE-INV-NEXT:    s_endpgm
790    i32* %out, i32 %in) {
791entry:
792  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
793  ret void
794}
795
; Agent-scope acquire atomic exchange through %out whose result is stored back
; to %out, so the returning form of the instruction is expected
; (flat_atomic_swap ... glc). The assertions below expect no wait directly in
; front of the atomic; after it they expect s_waitcnt vmcnt(0) lgkmcnt(0) and
; a cache invalidate (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
; buffer_gl1_inv on gfx1010) ahead of the flat_store_dword.
; The -amdgcn-skip-cache-invalidations run expects the wait but no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
796define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
797; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
798; GFX7:       ; %bb.0: ; %entry
799; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
800; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
801; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX7-NEXT:    v_mov_b32_e32 v0, s0
803; GFX7-NEXT:    v_mov_b32_e32 v1, s1
804; GFX7-NEXT:    v_mov_b32_e32 v2, s2
805; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
806; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
807; GFX7-NEXT:    buffer_wbinvl1_vol
808; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
809; GFX7-NEXT:    flat_store_dword v[0:1], v2
810; GFX7-NEXT:    s_endpgm
811;
812; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
813; GFX10-WGP:       ; %bb.0: ; %entry
814; GFX10-WGP-NEXT:    s_clause 0x1
815; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
816; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
817; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
818; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
819; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
820; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
821; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
822; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
823; GFX10-WGP-NEXT:    buffer_gl0_inv
824; GFX10-WGP-NEXT:    buffer_gl1_inv
825; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
826; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
827; GFX10-WGP-NEXT:    s_endpgm
828;
829; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
830; GFX10-CU:       ; %bb.0: ; %entry
831; GFX10-CU-NEXT:    s_clause 0x1
832; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
833; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
834; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
836; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
837; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
838; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
839; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
840; GFX10-CU-NEXT:    buffer_gl0_inv
841; GFX10-CU-NEXT:    buffer_gl1_inv
842; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
844; GFX10-CU-NEXT:    s_endpgm
845;
846; SKIP-CACHE-INV-LABEL: flat_agent_acquire_ret_atomicrmw:
847; SKIP-CACHE-INV:       ; %bb.0: ; %entry
848; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
849; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
850; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
851; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
852; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
853; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
854; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
855; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
856; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
857; SKIP-CACHE-INV-NEXT:    s_endpgm
858    i32* %out, i32 %in) {
859entry:
860  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
861  store i32 %val, i32* %out, align 4
862  ret void
863}
864
; Agent-scope acq_rel atomic exchange through %out whose result is stored back
; to %out, so the returning form of the instruction is expected
; (flat_atomic_swap ... glc). The assertions below expect an s_waitcnt in
; front of the atomic (plus s_waitcnt_vscnt on gfx1010); after it they expect
; s_waitcnt vmcnt(0) lgkmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on
; gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010) ahead of the
; flat_store_dword.
; The -amdgcn-skip-cache-invalidations run expects the waits but no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
865define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
866; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
867; GFX7:       ; %bb.0: ; %entry
868; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
869; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
870; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX7-NEXT:    v_mov_b32_e32 v0, s0
872; GFX7-NEXT:    v_mov_b32_e32 v1, s1
873; GFX7-NEXT:    v_mov_b32_e32 v2, s2
874; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
875; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
876; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
877; GFX7-NEXT:    buffer_wbinvl1_vol
878; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
879; GFX7-NEXT:    flat_store_dword v[0:1], v2
880; GFX7-NEXT:    s_endpgm
881;
882; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
883; GFX10-WGP:       ; %bb.0: ; %entry
884; GFX10-WGP-NEXT:    s_clause 0x1
885; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
886; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
887; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
888; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
889; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
890; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
891; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
892; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
893; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
894; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
895; GFX10-WGP-NEXT:    buffer_gl0_inv
896; GFX10-WGP-NEXT:    buffer_gl1_inv
897; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
899; GFX10-WGP-NEXT:    s_endpgm
900;
901; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
902; GFX10-CU:       ; %bb.0: ; %entry
903; GFX10-CU-NEXT:    s_clause 0x1
904; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
905; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
906; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
908; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
909; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
910; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
911; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
912; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
913; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
914; GFX10-CU-NEXT:    buffer_gl0_inv
915; GFX10-CU-NEXT:    buffer_gl1_inv
916; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
917; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
918; GFX10-CU-NEXT:    s_endpgm
919;
920; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_ret_atomicrmw:
921; SKIP-CACHE-INV:       ; %bb.0: ; %entry
922; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
923; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
924; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
925; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
926; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
927; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
928; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
929; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
930; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
931; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
932; SKIP-CACHE-INV-NEXT:    s_endpgm
933    i32* %out, i32 %in) {
934entry:
935  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
936  store i32 %val, i32* %out, align 4
937  ret void
938}
939
; Agent-scope seq_cst atomic exchange through %out whose result is stored back
; to %out, so the returning form of the instruction is expected
; (flat_atomic_swap ... glc). The assertions below expect an s_waitcnt in
; front of the atomic (plus s_waitcnt_vscnt on gfx1010); after it they expect
; s_waitcnt vmcnt(0) lgkmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on
; gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010) ahead of the
; flat_store_dword.
; The -amdgcn-skip-cache-invalidations run expects the waits but no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
940define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
941; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
942; GFX7:       ; %bb.0: ; %entry
943; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
944; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
945; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX7-NEXT:    v_mov_b32_e32 v0, s0
947; GFX7-NEXT:    v_mov_b32_e32 v1, s1
948; GFX7-NEXT:    v_mov_b32_e32 v2, s2
949; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
950; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
951; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
952; GFX7-NEXT:    buffer_wbinvl1_vol
953; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX7-NEXT:    flat_store_dword v[0:1], v2
955; GFX7-NEXT:    s_endpgm
956;
957; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
958; GFX10-WGP:       ; %bb.0: ; %entry
959; GFX10-WGP-NEXT:    s_clause 0x1
960; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
961; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
962; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
963; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
964; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
965; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
966; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
967; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
968; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
969; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
970; GFX10-WGP-NEXT:    buffer_gl0_inv
971; GFX10-WGP-NEXT:    buffer_gl1_inv
972; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
974; GFX10-WGP-NEXT:    s_endpgm
975;
976; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
977; GFX10-CU:       ; %bb.0: ; %entry
978; GFX10-CU-NEXT:    s_clause 0x1
979; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
980; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
981; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
983; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
984; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
985; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
986; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
987; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
988; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
989; GFX10-CU-NEXT:    buffer_gl0_inv
990; GFX10-CU-NEXT:    buffer_gl1_inv
991; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
992; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
993; GFX10-CU-NEXT:    s_endpgm
994;
995; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_ret_atomicrmw:
996; SKIP-CACHE-INV:       ; %bb.0: ; %entry
997; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
998; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
999; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1000; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1001; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1002; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1003; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1004; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1005; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1006; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1007; SKIP-CACHE-INV-NEXT:    s_endpgm
1008    i32* %out, i32 %in) {
1009entry:
1010  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
1011  store i32 %val, i32* %out, align 4
1012  ret void
1013}
1014
; Agent-scope cmpxchg with monotonic success and monotonic failure ordering.
; The access targets element 4 of %out (the expected code adds 16 to the base
; address), compares against %old and swaps in %in; the result is unused.
; The assertions below expect a bare flat_atomic_cmpswap in every run: no
; s_waitcnt directly before or after the atomic and no cache invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
1015define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
1016; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
1017; GFX7:       ; %bb.0: ; %entry
1018; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1019; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1020; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1021; GFX7-NEXT:    s_add_u32 s0, s0, 16
1022; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1023; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1024; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1025; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1026; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1027; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1028; GFX7-NEXT:    s_endpgm
1029;
1030; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
1031; GFX10-WGP:       ; %bb.0: ; %entry
1032; GFX10-WGP-NEXT:    s_clause 0x1
1033; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1034; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1035; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1037; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1038; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1039; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1040; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1041; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1042; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1043; GFX10-WGP-NEXT:    s_endpgm
1044;
1045; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
1046; GFX10-CU:       ; %bb.0: ; %entry
1047; GFX10-CU-NEXT:    s_clause 0x1
1048; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1049; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1050; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1051; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1052; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1053; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1054; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1055; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1056; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1057; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1058; GFX10-CU-NEXT:    s_endpgm
1059;
1060; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
1061; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1062; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1063; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1064; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1065; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1066; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1067; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1068; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1069; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1070; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1071; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1072; SKIP-CACHE-INV-NEXT:    s_endpgm
1073    i32* %out, i32 %in, i32 %old) {
1074entry:
1075  %gep = getelementptr i32, i32* %out, i32 4
1076  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
1077  ret void
1078}
1079
; Agent-scope cmpxchg with acquire success and monotonic failure ordering.
; The access targets element 4 of %out (the expected code adds 16 to the base
; address), compares against %old and swaps in %in; the result is unused.
; The assertions below expect no wait directly in front of the
; flat_atomic_cmpswap; after it they expect a wait (s_waitcnt, with
; s_waitcnt_vscnt on gfx1010) and a cache invalidate: buffer_wbinvl1_vol on
; gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010.
; The -amdgcn-skip-cache-invalidations run expects the wait but no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
1080define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
1081; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
1082; GFX7:       ; %bb.0: ; %entry
1083; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1084; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1085; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX7-NEXT:    s_add_u32 s0, s0, 16
1087; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1088; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1089; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1090; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1091; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1092; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1093; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1094; GFX7-NEXT:    buffer_wbinvl1_vol
1095; GFX7-NEXT:    s_endpgm
1096;
1097; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
1098; GFX10-WGP:       ; %bb.0: ; %entry
1099; GFX10-WGP-NEXT:    s_clause 0x1
1100; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1101; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1102; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1103; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1104; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1105; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1106; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1107; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1108; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1109; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1110; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1111; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1112; GFX10-WGP-NEXT:    buffer_gl0_inv
1113; GFX10-WGP-NEXT:    buffer_gl1_inv
1114; GFX10-WGP-NEXT:    s_endpgm
1115;
1116; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
1117; GFX10-CU:       ; %bb.0: ; %entry
1118; GFX10-CU-NEXT:    s_clause 0x1
1119; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1120; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1121; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1122; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1123; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1124; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1125; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1126; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1127; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1128; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1129; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1131; GFX10-CU-NEXT:    buffer_gl0_inv
1132; GFX10-CU-NEXT:    buffer_gl1_inv
1133; GFX10-CU-NEXT:    s_endpgm
1134;
1135; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_cmpxchg:
1136; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1137; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1138; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1139; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1140; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1141; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1143; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1144; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1145; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1146; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1147; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1148; SKIP-CACHE-INV-NEXT:    s_endpgm
1149    i32* %out, i32 %in, i32 %old) {
1150entry:
1151  %gep = getelementptr i32, i32* %out, i32 4
1152  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
1153  ret void
1154}
1155
; Agent-scope cmpxchg with release success and monotonic failure ordering.
; The access targets element 4 of %out (the expected code adds 16 to the base
; address), compares against %old and swaps in %in; the result is unused.
; The assertions below expect a wait in front of the flat_atomic_cmpswap
; (s_waitcnt vmcnt(0) lgkmcnt(0), plus s_waitcnt_vscnt on gfx1010) and nothing
; between the atomic and s_endpgm: no trailing wait and no cache invalidate
; in any run.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
1156define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
1157; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
1158; GFX7:       ; %bb.0: ; %entry
1159; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1160; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1161; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1162; GFX7-NEXT:    s_add_u32 s0, s0, 16
1163; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1164; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1165; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1166; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1167; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1168; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1169; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1170; GFX7-NEXT:    s_endpgm
1171;
1172; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
1173; GFX10-WGP:       ; %bb.0: ; %entry
1174; GFX10-WGP-NEXT:    s_clause 0x1
1175; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1176; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1177; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1178; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1179; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1180; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1181; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1182; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1183; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1184; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1185; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1186; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1187; GFX10-WGP-NEXT:    s_endpgm
1188;
1189; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
1190; GFX10-CU:       ; %bb.0: ; %entry
1191; GFX10-CU-NEXT:    s_clause 0x1
1192; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1193; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1194; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1196; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1197; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1198; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1199; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1200; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1201; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1202; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1203; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1204; GFX10-CU-NEXT:    s_endpgm
1205;
1206; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_cmpxchg:
1207; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1208; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1209; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1210; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1211; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1212; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1213; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1214; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1215; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1216; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1217; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1218; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1219; SKIP-CACHE-INV-NEXT:    s_endpgm
1220    i32* %out, i32 %in, i32 %old) {
1221entry:
1222  %gep = getelementptr i32, i32* %out, i32 4
1223  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release monotonic
1224  ret void
1225}
1226
; Agent-scope cmpxchg with acq_rel success and monotonic failure ordering.
; The access targets element 4 of %out (the expected code adds 16 to the base
; address), compares against %old and swaps in %in; the result is unused.
; The assertions below expect a wait on both sides of the flat_atomic_cmpswap
; (s_waitcnt, with s_waitcnt_vscnt on gfx1010) and, after the trailing wait,
; a cache invalidate: buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
; buffer_gl1_inv on gfx1010.
; The -amdgcn-skip-cache-invalidations run expects the waits but no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
1227define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
1228; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
1229; GFX7:       ; %bb.0: ; %entry
1230; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1231; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1232; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1233; GFX7-NEXT:    s_add_u32 s0, s0, 16
1234; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1235; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1236; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1237; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1238; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1239; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1240; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1241; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1242; GFX7-NEXT:    buffer_wbinvl1_vol
1243; GFX7-NEXT:    s_endpgm
1244;
1245; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
1246; GFX10-WGP:       ; %bb.0: ; %entry
1247; GFX10-WGP-NEXT:    s_clause 0x1
1248; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1249; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1250; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1251; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1252; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1253; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1254; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1255; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1256; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1257; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1258; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1259; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1260; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1262; GFX10-WGP-NEXT:    buffer_gl0_inv
1263; GFX10-WGP-NEXT:    buffer_gl1_inv
1264; GFX10-WGP-NEXT:    s_endpgm
1265;
1266; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
1267; GFX10-CU:       ; %bb.0: ; %entry
1268; GFX10-CU-NEXT:    s_clause 0x1
1269; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1270; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1271; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1272; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1273; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1274; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1275; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1276; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1277; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1278; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1279; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1280; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1281; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1282; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1283; GFX10-CU-NEXT:    buffer_gl0_inv
1284; GFX10-CU-NEXT:    buffer_gl1_inv
1285; GFX10-CU-NEXT:    s_endpgm
1286;
1287; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
1288; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1289; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1290; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1291; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1292; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1293; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1294; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1295; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1296; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1297; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1298; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1299; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1300; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1301; SKIP-CACHE-INV-NEXT:    s_endpgm
1302    i32* %out, i32 %in, i32 %old) {
1303entry:
1304  %gep = getelementptr i32, i32* %out, i32 4
1305  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
1306  ret void
1307}
1308
; Agent-scope cmpxchg with seq_cst success and monotonic failure ordering.
; The access targets element 4 of %out (the expected code adds 16 to the base
; address), compares against %old and swaps in %in; the result is unused.
; The assertions below expect a wait on both sides of the flat_atomic_cmpswap
; (s_waitcnt, with s_waitcnt_vscnt on gfx1010) and, after the trailing wait,
; a cache invalidate: buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
; buffer_gl1_inv on gfx1010.
; The -amdgcn-skip-cache-invalidations run expects the waits but no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
1309define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
1310; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
1311; GFX7:       ; %bb.0: ; %entry
1312; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1313; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1314; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX7-NEXT:    s_add_u32 s0, s0, 16
1316; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1317; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1318; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1319; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1320; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1321; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1322; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1323; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1324; GFX7-NEXT:    buffer_wbinvl1_vol
1325; GFX7-NEXT:    s_endpgm
1326;
1327; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
1328; GFX10-WGP:       ; %bb.0: ; %entry
1329; GFX10-WGP-NEXT:    s_clause 0x1
1330; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1331; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1332; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1334; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1335; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1336; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1337; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1338; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1339; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1340; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1341; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1342; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1344; GFX10-WGP-NEXT:    buffer_gl0_inv
1345; GFX10-WGP-NEXT:    buffer_gl1_inv
1346; GFX10-WGP-NEXT:    s_endpgm
1347;
1348; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
1349; GFX10-CU:       ; %bb.0: ; %entry
1350; GFX10-CU-NEXT:    s_clause 0x1
1351; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1352; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1353; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1354; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1355; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1356; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1357; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1358; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1359; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1360; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1361; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1362; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1363; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1365; GFX10-CU-NEXT:    buffer_gl0_inv
1366; GFX10-CU-NEXT:    buffer_gl1_inv
1367; GFX10-CU-NEXT:    s_endpgm
1368;
1369; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
1370; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1371; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1372; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1373; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1374; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1375; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1376; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1377; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1378; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1379; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1380; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1381; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1382; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1383; SKIP-CACHE-INV-NEXT:    s_endpgm
1384    i32* %out, i32 %in, i32 %old) {
1385entry:
1386  %gep = getelementptr i32, i32* %out, i32 4
1387  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
1388  ret void
1389}
1390
; Agent-scope cmpxchg with acquire success and acquire failure ordering.
; The access targets element 4 of %out (the expected code adds 16 to the base
; address), compares against %old and swaps in %in; the result is unused.
; The assertions below expect no wait directly in front of the
; flat_atomic_cmpswap; after it they expect a wait (s_waitcnt, with
; s_waitcnt_vscnt on gfx1010) and a cache invalidate: buffer_wbinvl1_vol on
; gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010.
; The -amdgcn-skip-cache-invalidations run expects the wait but no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
1391define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
1392; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
1393; GFX7:       ; %bb.0: ; %entry
1394; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1395; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1396; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX7-NEXT:    s_add_u32 s0, s0, 16
1398; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1399; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1400; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1401; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1402; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1403; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1404; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1405; GFX7-NEXT:    buffer_wbinvl1_vol
1406; GFX7-NEXT:    s_endpgm
1407;
1408; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
1409; GFX10-WGP:       ; %bb.0: ; %entry
1410; GFX10-WGP-NEXT:    s_clause 0x1
1411; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1412; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1413; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1414; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1415; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1416; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1417; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1418; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1419; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1420; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1421; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1422; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1423; GFX10-WGP-NEXT:    buffer_gl0_inv
1424; GFX10-WGP-NEXT:    buffer_gl1_inv
1425; GFX10-WGP-NEXT:    s_endpgm
1426;
1427; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
1428; GFX10-CU:       ; %bb.0: ; %entry
1429; GFX10-CU-NEXT:    s_clause 0x1
1430; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1431; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1432; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1434; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1435; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1436; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1437; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1438; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1439; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1440; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1442; GFX10-CU-NEXT:    buffer_gl0_inv
1443; GFX10-CU-NEXT:    buffer_gl1_inv
1444; GFX10-CU-NEXT:    s_endpgm
1445;
1446; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_cmpxchg:
1447; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1448; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1449; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1450; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1451; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1452; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1453; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1454; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1455; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1456; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1457; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1458; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1459; SKIP-CACHE-INV-NEXT:    s_endpgm
1460    i32* %out, i32 %in, i32 %old) {
1461entry:
1462  %gep = getelementptr i32, i32* %out, i32 4
1463  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
1464  ret void
1465}
1466
; Agent-scope cmpxchg through a flat pointer with success ordering release and
; failure ordering acquire; the result is unused. In the expected output every
; run has an s_waitcnt directly before flat_atomic_cmpswap and another directly
; after it, and every run except SKIP-CACHE-INV then issues a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
1467define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
1468; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
1469; GFX7:       ; %bb.0: ; %entry
1470; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1471; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1472; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1473; GFX7-NEXT:    s_add_u32 s0, s0, 16
1474; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1475; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1476; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1477; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1478; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1479; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1480; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1481; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1482; GFX7-NEXT:    buffer_wbinvl1_vol
1483; GFX7-NEXT:    s_endpgm
1484;
1485; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
1486; GFX10-WGP:       ; %bb.0: ; %entry
1487; GFX10-WGP-NEXT:    s_clause 0x1
1488; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1489; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1490; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1492; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1493; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1494; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1495; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1496; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1497; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1498; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1499; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1500; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1501; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1502; GFX10-WGP-NEXT:    buffer_gl0_inv
1503; GFX10-WGP-NEXT:    buffer_gl1_inv
1504; GFX10-WGP-NEXT:    s_endpgm
1505;
1506; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
1507; GFX10-CU:       ; %bb.0: ; %entry
1508; GFX10-CU-NEXT:    s_clause 0x1
1509; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1510; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1511; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1512; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1513; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1514; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1515; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1516; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1517; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1518; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1519; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1520; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1521; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1522; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1523; GFX10-CU-NEXT:    buffer_gl0_inv
1524; GFX10-CU-NEXT:    buffer_gl1_inv
1525; GFX10-CU-NEXT:    s_endpgm
1526;
1527; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_cmpxchg:
1528; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1529; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1530; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1531; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1532; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1533; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1534; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1535; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1536; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1537; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1538; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1539; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1540; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1541; SKIP-CACHE-INV-NEXT:    s_endpgm
1542    i32* %out, i32 %in, i32 %old) {
1543entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
1544  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
1545  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire
1546  ret void
1547}
1548
; Agent-scope cmpxchg through a flat pointer with success ordering acq_rel and
; failure ordering acquire; the result is unused. In the expected output every
; run has an s_waitcnt directly before flat_atomic_cmpswap and another directly
; after it, and every run except SKIP-CACHE-INV then issues a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
1549define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
1550; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
1551; GFX7:       ; %bb.0: ; %entry
1552; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1553; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1554; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1555; GFX7-NEXT:    s_add_u32 s0, s0, 16
1556; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1557; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1558; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1559; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1560; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1561; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1562; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1563; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1564; GFX7-NEXT:    buffer_wbinvl1_vol
1565; GFX7-NEXT:    s_endpgm
1566;
1567; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
1568; GFX10-WGP:       ; %bb.0: ; %entry
1569; GFX10-WGP-NEXT:    s_clause 0x1
1570; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1571; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1572; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1573; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1574; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1575; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1576; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1577; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1578; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1579; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1580; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1581; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1582; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1583; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1584; GFX10-WGP-NEXT:    buffer_gl0_inv
1585; GFX10-WGP-NEXT:    buffer_gl1_inv
1586; GFX10-WGP-NEXT:    s_endpgm
1587;
1588; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
1589; GFX10-CU:       ; %bb.0: ; %entry
1590; GFX10-CU-NEXT:    s_clause 0x1
1591; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1592; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1593; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1595; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1596; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1597; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1598; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1599; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1600; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1601; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1602; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1603; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1604; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1605; GFX10-CU-NEXT:    buffer_gl0_inv
1606; GFX10-CU-NEXT:    buffer_gl1_inv
1607; GFX10-CU-NEXT:    s_endpgm
1608;
1609; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
1610; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1611; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1612; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1613; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1614; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1615; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1616; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1617; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1618; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1619; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1620; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1621; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1622; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1623; SKIP-CACHE-INV-NEXT:    s_endpgm
1624    i32* %out, i32 %in, i32 %old) {
1625entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
1626  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
1627  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
1628  ret void
1629}
1630
; Agent-scope cmpxchg through a flat pointer with success ordering seq_cst and
; failure ordering acquire; the result is unused. In the expected output every
; run has an s_waitcnt directly before flat_atomic_cmpswap and another directly
; after it, and every run except SKIP-CACHE-INV then issues a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
1631define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
1632; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
1633; GFX7:       ; %bb.0: ; %entry
1634; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1635; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1636; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1637; GFX7-NEXT:    s_add_u32 s0, s0, 16
1638; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1639; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1640; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1641; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1642; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1643; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1644; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1645; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1646; GFX7-NEXT:    buffer_wbinvl1_vol
1647; GFX7-NEXT:    s_endpgm
1648;
1649; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
1650; GFX10-WGP:       ; %bb.0: ; %entry
1651; GFX10-WGP-NEXT:    s_clause 0x1
1652; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1653; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1654; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1655; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1656; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1657; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1658; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1659; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1660; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1661; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1662; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1663; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1664; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1665; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1666; GFX10-WGP-NEXT:    buffer_gl0_inv
1667; GFX10-WGP-NEXT:    buffer_gl1_inv
1668; GFX10-WGP-NEXT:    s_endpgm
1669;
1670; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
1671; GFX10-CU:       ; %bb.0: ; %entry
1672; GFX10-CU-NEXT:    s_clause 0x1
1673; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1674; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1675; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1676; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1677; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1678; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1679; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1680; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1681; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1682; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1683; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1684; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1685; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1686; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1687; GFX10-CU-NEXT:    buffer_gl0_inv
1688; GFX10-CU-NEXT:    buffer_gl1_inv
1689; GFX10-CU-NEXT:    s_endpgm
1690;
1691; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
1692; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1693; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1694; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1695; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1696; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1697; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1698; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1699; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1700; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1701; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1702; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1703; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1704; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1705; SKIP-CACHE-INV-NEXT:    s_endpgm
1706    i32* %out, i32 %in, i32 %old) {
1707entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
1708  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
1709  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
1710  ret void
1711}
1712
; Agent-scope cmpxchg through a flat pointer with seq_cst for both the success
; and the failure ordering; the result is unused. In the expected output every
; run has an s_waitcnt directly before flat_atomic_cmpswap and another directly
; after it, and every run except SKIP-CACHE-INV then issues a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
1713define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
1714; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
1715; GFX7:       ; %bb.0: ; %entry
1716; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1717; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1718; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX7-NEXT:    s_add_u32 s0, s0, 16
1720; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1721; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1722; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1723; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1724; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1725; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1726; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1727; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1728; GFX7-NEXT:    buffer_wbinvl1_vol
1729; GFX7-NEXT:    s_endpgm
1730;
1731; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
1732; GFX10-WGP:       ; %bb.0: ; %entry
1733; GFX10-WGP-NEXT:    s_clause 0x1
1734; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1735; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1736; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1737; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
1738; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
1739; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1740; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1741; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1742; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1743; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1744; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1745; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1746; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1747; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1748; GFX10-WGP-NEXT:    buffer_gl0_inv
1749; GFX10-WGP-NEXT:    buffer_gl1_inv
1750; GFX10-WGP-NEXT:    s_endpgm
1751;
1752; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
1753; GFX10-CU:       ; %bb.0: ; %entry
1754; GFX10-CU-NEXT:    s_clause 0x1
1755; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1756; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1757; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1758; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
1759; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
1760; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1761; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1762; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1763; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1764; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1765; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1766; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1767; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1768; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1769; GFX10-CU-NEXT:    buffer_gl0_inv
1770; GFX10-CU-NEXT:    buffer_gl1_inv
1771; GFX10-CU-NEXT:    s_endpgm
1772;
1773; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
1774; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1775; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1776; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1777; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1778; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
1779; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
1780; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1781; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1783; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1784; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1785; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1786; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1787; SKIP-CACHE-INV-NEXT:    s_endpgm
1788    i32* %out, i32 %in, i32 %old) {
1789entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
1790  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
1791  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
1792  ret void
1793}
1794
; Agent-scope cmpxchg through a flat pointer with success ordering acquire and
; failure ordering monotonic, where the loaded value (field 0 of the result) is
; stored back to %out. The expected output uses the returning form of
; flat_atomic_cmpswap (destination v2, glc) with no s_waitcnt directly before
; it. After it comes an s_waitcnt and, in every run except SKIP-CACHE-INV, a
; cache invalidate (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
; buffer_gl1_inv on gfx1010), all ahead of the flat_store_dword.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
1795define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
1796; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
1797; GFX7:       ; %bb.0: ; %entry
1798; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1799; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1800; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1801; GFX7-NEXT:    s_add_u32 s4, s0, 16
1802; GFX7-NEXT:    s_addc_u32 s5, s1, 0
1803; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1804; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1805; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1806; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1807; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1808; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1809; GFX7-NEXT:    buffer_wbinvl1_vol
1810; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1811; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1812; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1813; GFX7-NEXT:    flat_store_dword v[0:1], v2
1814; GFX7-NEXT:    s_endpgm
1815;
1816; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
1817; GFX10-WGP:       ; %bb.0: ; %entry
1818; GFX10-WGP-NEXT:    s_clause 0x1
1819; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1820; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1821; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1822; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
1823; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
1824; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
1825; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1826; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
1827; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1828; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1829; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1830; GFX10-WGP-NEXT:    buffer_gl0_inv
1831; GFX10-WGP-NEXT:    buffer_gl1_inv
1832; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1833; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1834; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1835; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1836; GFX10-WGP-NEXT:    s_endpgm
1837;
1838; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
1839; GFX10-CU:       ; %bb.0: ; %entry
1840; GFX10-CU-NEXT:    s_clause 0x1
1841; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1842; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1843; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
1845; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
1846; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
1847; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1848; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
1849; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1850; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1851; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1852; GFX10-CU-NEXT:    buffer_gl0_inv
1853; GFX10-CU-NEXT:    buffer_gl1_inv
1854; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1855; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1856; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1857; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1858; GFX10-CU-NEXT:    s_endpgm
1859;
1860; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
1861; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1862; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1863; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1864; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1865; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
1866; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
1867; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1868; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1869; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
1870; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1871; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1872; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1873; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1874; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1875; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1876; SKIP-CACHE-INV-NEXT:    s_endpgm
1877    i32* %out, i32 %in, i32 %old) {
1878entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
1879  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
1880  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
  ; Only the loaded i32 is used; the i1 success flag is ignored.
1881  %val0 = extractvalue { i32, i1 } %val, 0
1882  store i32 %val0, i32* %out, align 4
1883  ret void
1884}
1885
; Agent-scope cmpxchg through a flat pointer with success ordering acq_rel and
; failure ordering monotonic, where the loaded value (field 0 of the result) is
; stored back to %out. The expected output uses the returning form of
; flat_atomic_cmpswap (destination v2, glc) with an s_waitcnt directly before
; it. After it comes another s_waitcnt and, in every run except SKIP-CACHE-INV,
; a cache invalidate (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
; buffer_gl1_inv on gfx1010), all ahead of the flat_store_dword.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
1886define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
1887; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
1888; GFX7:       ; %bb.0: ; %entry
1889; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1890; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1891; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1892; GFX7-NEXT:    s_add_u32 s4, s0, 16
1893; GFX7-NEXT:    s_addc_u32 s5, s1, 0
1894; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1895; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1896; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1897; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1898; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1899; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1900; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1901; GFX7-NEXT:    buffer_wbinvl1_vol
1902; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1903; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1904; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1905; GFX7-NEXT:    flat_store_dword v[0:1], v2
1906; GFX7-NEXT:    s_endpgm
1907;
1908; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
1909; GFX10-WGP:       ; %bb.0: ; %entry
1910; GFX10-WGP-NEXT:    s_clause 0x1
1911; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1912; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1913; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1914; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
1915; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
1916; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
1917; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
1918; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
1919; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
1920; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1921; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1922; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1923; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1924; GFX10-WGP-NEXT:    buffer_gl0_inv
1925; GFX10-WGP-NEXT:    buffer_gl1_inv
1926; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1927; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1928; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1929; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
1930; GFX10-WGP-NEXT:    s_endpgm
1931;
1932; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
1933; GFX10-CU:       ; %bb.0: ; %entry
1934; GFX10-CU-NEXT:    s_clause 0x1
1935; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1936; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1937; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
1939; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
1940; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
1941; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
1942; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
1943; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
1944; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1945; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1946; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1947; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1948; GFX10-CU-NEXT:    buffer_gl0_inv
1949; GFX10-CU-NEXT:    buffer_gl1_inv
1950; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1951; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1952; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1953; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
1954; GFX10-CU-NEXT:    s_endpgm
1955;
1956; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
1957; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1958; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
1959; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1960; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1961; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
1962; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
1963; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
1964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
1965; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
1966; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
1967; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1968; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1969; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1970; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
1971; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
1972; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
1973; SKIP-CACHE-INV-NEXT:    s_endpgm
1974    i32* %out, i32 %in, i32 %old) {
1975entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
1976  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
1977  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
  ; Only the loaded i32 is used; the i1 success flag is ignored.
1978  %val0 = extractvalue { i32, i1 } %val, 0
1979  store i32 %val0, i32* %out, align 4
1980  ret void
1981}
1982
; Agent-scope cmpxchg through a flat pointer with success ordering seq_cst and
; failure ordering monotonic, where the loaded value (field 0 of the result) is
; stored back to %out. The expected output uses the returning form of
; flat_atomic_cmpswap (destination v2, glc) with an s_waitcnt directly before
; it. After it comes another s_waitcnt and, in every run except SKIP-CACHE-INV,
; a cache invalidate (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
; buffer_gl1_inv on gfx1010), all ahead of the flat_store_dword.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
1983define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
1984; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
1985; GFX7:       ; %bb.0: ; %entry
1986; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1987; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1988; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1989; GFX7-NEXT:    s_add_u32 s4, s0, 16
1990; GFX7-NEXT:    s_addc_u32 s5, s1, 0
1991; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1992; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1993; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1994; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1995; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1996; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1997; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1998; GFX7-NEXT:    buffer_wbinvl1_vol
1999; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2000; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2001; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2002; GFX7-NEXT:    flat_store_dword v[0:1], v2
2003; GFX7-NEXT:    s_endpgm
2004;
2005; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
2006; GFX10-WGP:       ; %bb.0: ; %entry
2007; GFX10-WGP-NEXT:    s_clause 0x1
2008; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2009; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2010; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2011; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2012; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2013; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2014; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2015; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2016; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2017; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2018; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2019; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2020; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2021; GFX10-WGP-NEXT:    buffer_gl0_inv
2022; GFX10-WGP-NEXT:    buffer_gl1_inv
2023; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2024; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2025; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2026; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2027; GFX10-WGP-NEXT:    s_endpgm
2028;
2029; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
2030; GFX10-CU:       ; %bb.0: ; %entry
2031; GFX10-CU-NEXT:    s_clause 0x1
2032; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2033; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2034; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2035; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2036; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2037; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2038; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2039; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2040; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2041; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2042; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2043; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2044; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2045; GFX10-CU-NEXT:    buffer_gl0_inv
2046; GFX10-CU-NEXT:    buffer_gl1_inv
2047; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2048; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2049; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2050; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2051; GFX10-CU-NEXT:    s_endpgm
2052;
2053; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
2054; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2055; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2056; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2057; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2058; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2059; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2060; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2061; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2062; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2063; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2064; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2065; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2066; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2067; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2068; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2069; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2070; SKIP-CACHE-INV-NEXT:    s_endpgm
2071    i32* %out, i32 %in, i32 %old) {
2072entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
2073  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
2074  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
  ; Only the loaded i32 is used; the i1 success flag is ignored.
2075  %val0 = extractvalue { i32, i1 } %val, 0
2076  store i32 %val0, i32* %out, align 4
2077  ret void
2078}
2079
; Agent-scope cmpxchg through a flat pointer with acquire for both the success
; and the failure ordering, where the loaded value (field 0 of the result) is
; stored back to %out. The expected output uses the returning form of
; flat_atomic_cmpswap (destination v2, glc) with no s_waitcnt directly before
; it. After it comes an s_waitcnt and, in every run except SKIP-CACHE-INV, a
; cache invalidate (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv +
; buffer_gl1_inv on gfx1010), all ahead of the flat_store_dword.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with the update script rather than editing them by hand.
2080define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
2081; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
2082; GFX7:       ; %bb.0: ; %entry
2083; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2084; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2085; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2086; GFX7-NEXT:    s_add_u32 s4, s0, 16
2087; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2088; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2089; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2090; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2091; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2092; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2093; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2094; GFX7-NEXT:    buffer_wbinvl1_vol
2095; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2096; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2097; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2098; GFX7-NEXT:    flat_store_dword v[0:1], v2
2099; GFX7-NEXT:    s_endpgm
2100;
2101; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
2102; GFX10-WGP:       ; %bb.0: ; %entry
2103; GFX10-WGP-NEXT:    s_clause 0x1
2104; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2105; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2106; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2107; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2108; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2109; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2110; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2111; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2112; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2113; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2114; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2115; GFX10-WGP-NEXT:    buffer_gl0_inv
2116; GFX10-WGP-NEXT:    buffer_gl1_inv
2117; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2118; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2119; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2120; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2121; GFX10-WGP-NEXT:    s_endpgm
2122;
2123; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
2124; GFX10-CU:       ; %bb.0: ; %entry
2125; GFX10-CU-NEXT:    s_clause 0x1
2126; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2127; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2128; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2129; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2130; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2131; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2132; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2133; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2134; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2135; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2136; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2137; GFX10-CU-NEXT:    buffer_gl0_inv
2138; GFX10-CU-NEXT:    buffer_gl1_inv
2139; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2140; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2141; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2142; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2143; GFX10-CU-NEXT:    s_endpgm
2144;
2145; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
2146; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2147; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2148; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2149; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2150; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2151; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2152; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2153; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2154; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2155; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2156; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2157; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2158; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2159; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2160; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2161; SKIP-CACHE-INV-NEXT:    s_endpgm
2162    i32* %out, i32 %in, i32 %old) {
2163entry:
  ; %gep is element 4 of %out, i.e. the +16 byte offset added by the
  ; s_add_u32/s_addc_u32 pair in the expected output.
2164  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare operand and %in the replacement value.
2165  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
  ; Only the loaded i32 is used; the i1 success flag is ignored.
2166  %val0 = extractvalue { i32, i1 } %val, 0
2167  store i32 %val0, i32* %out, align 4
2168  ret void
2169}
2170
; Returning cmpxchg through a flat pointer at syncscope("agent") with release
; success ordering and acquire failure ordering; the value read by the cmpxchg
; is stored back to %out.
; In the checks below, the GFX7 sequence has "s_waitcnt vmcnt(0) lgkmcnt(0)"
; before the atomic and "buffer_wbinvl1_vol" after it, the GFX10 sequences
; (WGP and CU mode) additionally have "s_waitcnt_vscnt null, 0x0" before the
; atomic and use "buffer_gl0_inv" + "buffer_gl1_inv" after it, and the
; SKIP-CACHE-INV sequence keeps the waits but has no invalidate instruction.
2171define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
2172; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
2173; GFX7:       ; %bb.0: ; %entry
2174; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2175; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2176; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2177; GFX7-NEXT:    s_add_u32 s4, s0, 16
2178; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2179; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2180; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2181; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2182; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2183; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2184; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2185; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2186; GFX7-NEXT:    buffer_wbinvl1_vol
2187; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2188; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2189; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2190; GFX7-NEXT:    flat_store_dword v[0:1], v2
2191; GFX7-NEXT:    s_endpgm
2192;
2193; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
2194; GFX10-WGP:       ; %bb.0: ; %entry
2195; GFX10-WGP-NEXT:    s_clause 0x1
2196; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2197; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2198; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2199; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2200; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2201; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2202; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2203; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2204; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2205; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2206; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2207; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2208; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2209; GFX10-WGP-NEXT:    buffer_gl0_inv
2210; GFX10-WGP-NEXT:    buffer_gl1_inv
2211; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2212; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2213; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2214; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2215; GFX10-WGP-NEXT:    s_endpgm
2216;
2217; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
2218; GFX10-CU:       ; %bb.0: ; %entry
2219; GFX10-CU-NEXT:    s_clause 0x1
2220; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2221; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2222; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2223; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2224; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2225; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2226; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2227; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2228; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2229; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2230; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2231; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2232; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2233; GFX10-CU-NEXT:    buffer_gl0_inv
2234; GFX10-CU-NEXT:    buffer_gl1_inv
2235; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2236; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2237; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2238; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2239; GFX10-CU-NEXT:    s_endpgm
2240;
2241; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_ret_cmpxchg:
2242; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2243; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2244; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2245; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2246; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2247; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2248; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2249; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2252; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2253; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2254; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2255; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2256; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2257; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2258; SKIP-CACHE-INV-NEXT:    s_endpgm
2259    i32* %out, i32 %in, i32 %old) {
2260entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16,
  ; which is the constant added by s_add_u32/s_addc_u32 in the checks above.
2261  %gep = getelementptr i32, i32* %out, i32 4
2262  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out so the returning (glc) form of the atomic is required.
2263  %val0 = extractvalue { i32, i1 } %val, 0
2264  store i32 %val0, i32* %out, align 4
2265  ret void
2266}
2267
; Returning cmpxchg through a flat pointer at syncscope("agent") with acq_rel
; success ordering and acquire failure ordering; the value read by the cmpxchg
; is stored back to %out.
; In the checks below, the GFX7 sequence has "s_waitcnt vmcnt(0) lgkmcnt(0)"
; before the atomic and "buffer_wbinvl1_vol" after it, the GFX10 sequences
; (WGP and CU mode) additionally have "s_waitcnt_vscnt null, 0x0" before the
; atomic and use "buffer_gl0_inv" + "buffer_gl1_inv" after it, and the
; SKIP-CACHE-INV sequence keeps the waits but has no invalidate instruction.
2268define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
2269; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
2270; GFX7:       ; %bb.0: ; %entry
2271; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2272; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2273; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2274; GFX7-NEXT:    s_add_u32 s4, s0, 16
2275; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2276; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2277; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2278; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2279; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2280; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2281; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2282; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2283; GFX7-NEXT:    buffer_wbinvl1_vol
2284; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2285; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2286; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2287; GFX7-NEXT:    flat_store_dword v[0:1], v2
2288; GFX7-NEXT:    s_endpgm
2289;
2290; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
2291; GFX10-WGP:       ; %bb.0: ; %entry
2292; GFX10-WGP-NEXT:    s_clause 0x1
2293; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2294; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2295; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2296; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2297; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2298; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2299; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2300; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2301; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2302; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2303; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2304; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2305; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2306; GFX10-WGP-NEXT:    buffer_gl0_inv
2307; GFX10-WGP-NEXT:    buffer_gl1_inv
2308; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2309; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2310; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2311; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2312; GFX10-WGP-NEXT:    s_endpgm
2313;
2314; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
2315; GFX10-CU:       ; %bb.0: ; %entry
2316; GFX10-CU-NEXT:    s_clause 0x1
2317; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2318; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2319; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2320; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2321; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2322; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2323; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2324; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2325; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2326; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2327; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2328; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2329; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2330; GFX10-CU-NEXT:    buffer_gl0_inv
2331; GFX10-CU-NEXT:    buffer_gl1_inv
2332; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2333; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2334; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2335; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2336; GFX10-CU-NEXT:    s_endpgm
2337;
2338; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
2339; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2340; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2341; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2342; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2343; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2344; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2345; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2346; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2347; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2348; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2349; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2350; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2351; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2352; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2354; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2355; SKIP-CACHE-INV-NEXT:    s_endpgm
2356    i32* %out, i32 %in, i32 %old) {
2357entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16,
  ; which is the constant added by s_add_u32/s_addc_u32 in the checks above.
2358  %gep = getelementptr i32, i32* %out, i32 4
2359  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out so the returning (glc) form of the atomic is required.
2360  %val0 = extractvalue { i32, i1 } %val, 0
2361  store i32 %val0, i32* %out, align 4
2362  ret void
2363}
2364
; Returning cmpxchg through a flat pointer at syncscope("agent") with seq_cst
; success ordering and acquire failure ordering; the value read by the cmpxchg
; is stored back to %out.
; In the checks below, the GFX7 sequence has "s_waitcnt vmcnt(0) lgkmcnt(0)"
; before the atomic and "buffer_wbinvl1_vol" after it, the GFX10 sequences
; (WGP and CU mode) additionally have "s_waitcnt_vscnt null, 0x0" before the
; atomic and use "buffer_gl0_inv" + "buffer_gl1_inv" after it, and the
; SKIP-CACHE-INV sequence keeps the waits but has no invalidate instruction.
2365define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
2366; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
2367; GFX7:       ; %bb.0: ; %entry
2368; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2369; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2370; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2371; GFX7-NEXT:    s_add_u32 s4, s0, 16
2372; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2373; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2374; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2375; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2376; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2377; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2378; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2379; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2380; GFX7-NEXT:    buffer_wbinvl1_vol
2381; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2382; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2383; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2384; GFX7-NEXT:    flat_store_dword v[0:1], v2
2385; GFX7-NEXT:    s_endpgm
2386;
2387; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
2388; GFX10-WGP:       ; %bb.0: ; %entry
2389; GFX10-WGP-NEXT:    s_clause 0x1
2390; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2391; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2392; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2393; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2394; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2395; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2396; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2397; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2398; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2399; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2400; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2401; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2402; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2403; GFX10-WGP-NEXT:    buffer_gl0_inv
2404; GFX10-WGP-NEXT:    buffer_gl1_inv
2405; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2406; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2407; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2408; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2409; GFX10-WGP-NEXT:    s_endpgm
2410;
2411; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
2412; GFX10-CU:       ; %bb.0: ; %entry
2413; GFX10-CU-NEXT:    s_clause 0x1
2414; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2415; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2416; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2417; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2418; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2419; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2420; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2421; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2422; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2423; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2424; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2425; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2426; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2427; GFX10-CU-NEXT:    buffer_gl0_inv
2428; GFX10-CU-NEXT:    buffer_gl1_inv
2429; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2430; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2431; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2432; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2433; GFX10-CU-NEXT:    s_endpgm
2434;
2435; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
2436; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2437; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2438; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2439; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2440; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2441; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2442; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2443; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2444; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2445; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2446; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2447; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2448; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2449; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2450; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2451; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2452; SKIP-CACHE-INV-NEXT:    s_endpgm
2453    i32* %out, i32 %in, i32 %old) {
2454entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16,
  ; which is the constant added by s_add_u32/s_addc_u32 in the checks above.
2455  %gep = getelementptr i32, i32* %out, i32 4
2456  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out so the returning (glc) form of the atomic is required.
2457  %val0 = extractvalue { i32, i1 } %val, 0
2458  store i32 %val0, i32* %out, align 4
2459  ret void
2460}
2461
; Returning cmpxchg through a flat pointer at syncscope("agent") with seq_cst
; ordering for both the success and the failure case; the value read by the
; cmpxchg is stored back to %out.
; In the checks below, the GFX7 sequence has "s_waitcnt vmcnt(0) lgkmcnt(0)"
; before the atomic and "buffer_wbinvl1_vol" after it, the GFX10 sequences
; (WGP and CU mode) additionally have "s_waitcnt_vscnt null, 0x0" before the
; atomic and use "buffer_gl0_inv" + "buffer_gl1_inv" after it, and the
; SKIP-CACHE-INV sequence keeps the waits but has no invalidate instruction.
2462define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
2463; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
2464; GFX7:       ; %bb.0: ; %entry
2465; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2466; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2467; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2468; GFX7-NEXT:    s_add_u32 s4, s0, 16
2469; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2470; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2471; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2472; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2473; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2474; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2475; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2476; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2477; GFX7-NEXT:    buffer_wbinvl1_vol
2478; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2479; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2480; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2481; GFX7-NEXT:    flat_store_dword v[0:1], v2
2482; GFX7-NEXT:    s_endpgm
2483;
2484; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
2485; GFX10-WGP:       ; %bb.0: ; %entry
2486; GFX10-WGP-NEXT:    s_clause 0x1
2487; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2488; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2489; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2490; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
2491; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
2492; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
2493; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2494; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
2495; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
2496; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2497; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2498; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2499; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2500; GFX10-WGP-NEXT:    buffer_gl0_inv
2501; GFX10-WGP-NEXT:    buffer_gl1_inv
2502; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2503; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2504; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2505; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2506; GFX10-WGP-NEXT:    s_endpgm
2507;
2508; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
2509; GFX10-CU:       ; %bb.0: ; %entry
2510; GFX10-CU-NEXT:    s_clause 0x1
2511; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2512; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2513; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2514; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
2515; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
2516; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
2517; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2518; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
2519; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
2520; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2521; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2522; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2523; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2524; GFX10-CU-NEXT:    buffer_gl0_inv
2525; GFX10-CU-NEXT:    buffer_gl1_inv
2526; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2527; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2528; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2529; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2530; GFX10-CU-NEXT:    s_endpgm
2531;
2532; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
2533; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2534; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
2535; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2536; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2537; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
2538; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
2539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2540; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
2541; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
2542; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
2543; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2544; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2545; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2546; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
2547; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
2548; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2549; SKIP-CACHE-INV-NEXT:    s_endpgm
2550    i32* %out, i32 %in, i32 %old) {
2551entry:
  ; The atomic targets the i32 four elements past %out, i.e. byte offset 16,
  ; which is the constant added by s_add_u32/s_addc_u32 in the checks above.
2552  %gep = getelementptr i32, i32* %out, i32 4
2553  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ; Field 0 of the { i32, i1 } result is the value read from memory; it is
  ; written to %out so the returning (glc) form of the atomic is required.
2554  %val0 = extractvalue { i32, i1 } %val, 0
2555  store i32 %val0, i32* %out, align 4
2556  ret void
2557}
2558
; Atomic i32 load through a flat pointer at syncscope("agent-one-as") with
; unordered ordering; the loaded value is then written to %out with a plain
; (non-atomic) store.
; In the checks below every target uses a plain "flat_load_dword" with no
; glc/dlc modifier and no cache invalidate instruction; the only wait between
; the load and the store is "s_waitcnt vmcnt(0) lgkmcnt(0)".
2559define amdgpu_kernel void @flat_agent_one_as_unordered_load(
2560; GFX7-LABEL: flat_agent_one_as_unordered_load:
2561; GFX7:       ; %bb.0: ; %entry
2562; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2563; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2564; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2565; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2566; GFX7-NEXT:    flat_load_dword v0, v[0:1]
2567; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2568; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2569; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2570; GFX7-NEXT:    flat_store_dword v[2:3], v0
2571; GFX7-NEXT:    s_endpgm
2572;
2573; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
2574; GFX10-WGP:       ; %bb.0: ; %entry
2575; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2576; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2577; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2578; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2579; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
2580; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
2581; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
2582; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2583; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2584; GFX10-WGP-NEXT:    s_endpgm
2585;
2586; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
2587; GFX10-CU:       ; %bb.0: ; %entry
2588; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2589; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2590; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2591; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2592; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
2593; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
2594; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
2595; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2596; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2597; GFX10-CU-NEXT:    s_endpgm
2598;
2599; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_load:
2600; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2601; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2602; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2603; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2604; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2605; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1]
2606; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2607; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
2608; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2609; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
2610; SKIP-CACHE-INV-NEXT:    s_endpgm
2611    i32* %in, i32* %out) {
2612entry:
2613  %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4
  ; Plain store of the loaded value; it exists to give the load a user.
2614  store i32 %val, i32* %out
2615  ret void
2616}
2617
; Atomic i32 load through a flat pointer at syncscope("agent-one-as") with
; monotonic ordering; the loaded value is then written to %out with a plain
; (non-atomic) store.
; In the checks below the load carries the "glc" modifier on the GFX7 and
; SKIP-CACHE-INV sequences and "glc dlc" on the GFX10 sequences (WGP and CU
; mode); no target has a cache invalidate instruction.
2618define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
2619; GFX7-LABEL: flat_agent_one_as_monotonic_load:
2620; GFX7:       ; %bb.0: ; %entry
2621; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2622; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2623; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2624; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2625; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
2626; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2627; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2628; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2629; GFX7-NEXT:    flat_store_dword v[2:3], v0
2630; GFX7-NEXT:    s_endpgm
2631;
2632; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
2633; GFX10-WGP:       ; %bb.0: ; %entry
2634; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2635; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2636; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2637; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2638; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
2639; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
2640; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
2641; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2642; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2643; GFX10-WGP-NEXT:    s_endpgm
2644;
2645; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
2646; GFX10-CU:       ; %bb.0: ; %entry
2647; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2648; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2649; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2650; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2651; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
2652; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
2653; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
2654; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2655; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2656; GFX10-CU-NEXT:    s_endpgm
2657;
2658; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_load:
2659; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2660; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2661; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2662; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2663; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2664; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
2665; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2666; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
2667; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2668; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
2669; SKIP-CACHE-INV-NEXT:    s_endpgm
2670    i32* %in, i32* %out) {
2671entry:
2672  %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4
  ; Plain store of the loaded value; it exists to give the load a user.
2673  store i32 %val, i32* %out
2674  ret void
2675}
2676
; Atomic i32 load through a flat pointer at syncscope("agent-one-as") with
; acquire ordering; the loaded value is then written to %out with a plain
; (non-atomic) store.
; In the checks below the load carries "glc" (GFX7, SKIP-CACHE-INV) or
; "glc dlc" (GFX10, WGP and CU mode) and is followed by "s_waitcnt vmcnt(0)".
; After that wait the GFX7 sequence has "buffer_wbinvl1_vol" and the GFX10
; sequences have "buffer_gl0_inv" + "buffer_gl1_inv"; the SKIP-CACHE-INV
; sequence has the wait but no invalidate instruction.
2677define amdgpu_kernel void @flat_agent_one_as_acquire_load(
2678; GFX7-LABEL: flat_agent_one_as_acquire_load:
2679; GFX7:       ; %bb.0: ; %entry
2680; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2681; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2682; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2683; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2684; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
2685; GFX7-NEXT:    s_waitcnt vmcnt(0)
2686; GFX7-NEXT:    buffer_wbinvl1_vol
2687; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2688; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2689; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2690; GFX7-NEXT:    flat_store_dword v[2:3], v0
2691; GFX7-NEXT:    s_endpgm
2692;
2693; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
2694; GFX10-WGP:       ; %bb.0: ; %entry
2695; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2696; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2697; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2698; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2699; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
2700; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2701; GFX10-WGP-NEXT:    buffer_gl0_inv
2702; GFX10-WGP-NEXT:    buffer_gl1_inv
2703; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
2704; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
2705; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2706; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2707; GFX10-WGP-NEXT:    s_endpgm
2708;
2709; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
2710; GFX10-CU:       ; %bb.0: ; %entry
2711; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2712; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2713; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2714; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2715; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
2716; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2717; GFX10-CU-NEXT:    buffer_gl0_inv
2718; GFX10-CU-NEXT:    buffer_gl1_inv
2719; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
2720; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
2721; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2722; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2723; GFX10-CU-NEXT:    s_endpgm
2724;
2725; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_load:
2726; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2727; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2728; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2729; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2730; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2731; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
2732; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2733; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2734; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
2735; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2736; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
2737; SKIP-CACHE-INV-NEXT:    s_endpgm
2738    i32* %in, i32* %out) {
2739entry:
2740  %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4
  ; Plain store of the loaded value; it exists to give the load a user.
2741  store i32 %val, i32* %out
2742  ret void
2743}
2744
; Atomic i32 load through a flat pointer at syncscope("agent-one-as") with
; seq_cst ordering; the loaded value is then written to %out with a plain
; (non-atomic) store.
; In the checks below every target has "s_waitcnt vmcnt(0)" immediately
; before the load (the GFX10 sequences also have "s_waitcnt_vscnt null, 0x0")
; and another "s_waitcnt vmcnt(0)" immediately after it. The load carries
; "glc" (GFX7, SKIP-CACHE-INV) or "glc dlc" (GFX10, WGP and CU mode). After
; the second wait the GFX7 sequence has "buffer_wbinvl1_vol" and the GFX10
; sequences have "buffer_gl0_inv" + "buffer_gl1_inv"; the SKIP-CACHE-INV
; sequence has no invalidate instruction.
2745define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
2746; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
2747; GFX7:       ; %bb.0: ; %entry
2748; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2749; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2750; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2751; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2752; GFX7-NEXT:    s_waitcnt vmcnt(0)
2753; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
2754; GFX7-NEXT:    s_waitcnt vmcnt(0)
2755; GFX7-NEXT:    buffer_wbinvl1_vol
2756; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2757; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2758; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2759; GFX7-NEXT:    flat_store_dword v[2:3], v0
2760; GFX7-NEXT:    s_endpgm
2761;
2762; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
2763; GFX10-WGP:       ; %bb.0: ; %entry
2764; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2765; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2766; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2767; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2768; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2769; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2770; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
2771; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2772; GFX10-WGP-NEXT:    buffer_gl0_inv
2773; GFX10-WGP-NEXT:    buffer_gl1_inv
2774; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
2775; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
2776; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2777; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2778; GFX10-WGP-NEXT:    s_endpgm
2779;
2780; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
2781; GFX10-CU:       ; %bb.0: ; %entry
2782; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2783; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2784; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2785; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2786; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2787; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2788; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
2789; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2790; GFX10-CU-NEXT:    buffer_gl0_inv
2791; GFX10-CU-NEXT:    buffer_gl1_inv
2792; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
2793; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
2794; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2795; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2796; GFX10-CU-NEXT:    s_endpgm
2797;
2798; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_load:
2799; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2800; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2801; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2802; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2803; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2804; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2805; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] glc
2806; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2807; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2808; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
2809; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2810; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0
2811; SKIP-CACHE-INV-NEXT:    s_endpgm
2812    i32* %in, i32* %out) {
2813entry:
2814  %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4
  ; Plain store of the loaded value; it exists to give the load a user.
2815  store i32 %val, i32* %out
2816  ret void
2817}
2818
; Atomic i32 store of %in through the flat pointer %out at
; syncscope("agent-one-as") with unordered ordering.
; In the checks below every target emits a plain "flat_store_dword"; the only
; wait is the "s_waitcnt lgkmcnt(0)" that follows the s_load instructions, and
; no target has a cache invalidate instruction.
2819define amdgpu_kernel void @flat_agent_one_as_unordered_store(
2820; GFX7-LABEL: flat_agent_one_as_unordered_store:
2821; GFX7:       ; %bb.0: ; %entry
2822; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
2823; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
2824; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2825; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2826; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2827; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2828; GFX7-NEXT:    flat_store_dword v[0:1], v2
2829; GFX7-NEXT:    s_endpgm
2830;
2831; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
2832; GFX10-WGP:       ; %bb.0: ; %entry
2833; GFX10-WGP-NEXT:    s_clause 0x1
2834; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2835; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
2836; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2838; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2839; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2840; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2841; GFX10-WGP-NEXT:    s_endpgm
2842;
2843; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
2844; GFX10-CU:       ; %bb.0: ; %entry
2845; GFX10-CU-NEXT:    s_clause 0x1
2846; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2847; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
2848; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2849; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2850; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2851; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2852; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2853; GFX10-CU-NEXT:    s_endpgm
2854;
2855; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_store:
2856; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2857; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
2858; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2859; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2860; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2861; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2862; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2863; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2864; SKIP-CACHE-INV-NEXT:    s_endpgm
2865    i32 %in, i32* %out) {
2866entry:
2867  store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4
2868  ret void
2869}
2870
; Atomic i32 store of %in through the flat pointer %out at
; syncscope("agent-one-as") with monotonic ordering.
; In the checks below every target emits a plain "flat_store_dword"; the only
; wait is the "s_waitcnt lgkmcnt(0)" that follows the s_load instructions, and
; no target has a cache invalidate instruction.
2871define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
2872; GFX7-LABEL: flat_agent_one_as_monotonic_store:
2873; GFX7:       ; %bb.0: ; %entry
2874; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
2875; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
2876; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2877; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2878; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2879; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2880; GFX7-NEXT:    flat_store_dword v[0:1], v2
2881; GFX7-NEXT:    s_endpgm
2882;
2883; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
2884; GFX10-WGP:       ; %bb.0: ; %entry
2885; GFX10-WGP-NEXT:    s_clause 0x1
2886; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2887; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
2888; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2889; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2890; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2891; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2892; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2893; GFX10-WGP-NEXT:    s_endpgm
2894;
2895; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
2896; GFX10-CU:       ; %bb.0: ; %entry
2897; GFX10-CU-NEXT:    s_clause 0x1
2898; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2899; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
2900; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2901; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2902; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2903; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2904; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2905; GFX10-CU-NEXT:    s_endpgm
2906;
2907; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store:
2908; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2909; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
2910; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2911; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2912; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2913; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2914; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2915; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2916; SKIP-CACHE-INV-NEXT:    s_endpgm
2917    i32 %in, i32* %out) {
2918entry:
2919  store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4
2920  ret void
2921}
2922
; Release atomic store of %in through %out at syncscope "agent-one-as".
; The checks expect an s_waitcnt vmcnt(0) immediately ahead of the
; flat_store_dword. The two gfx1010 configurations additionally expect
; s_waitcnt_vscnt null, 0x0 between that wait and the store.
2923define amdgpu_kernel void @flat_agent_one_as_release_store(
2924; GFX7-LABEL: flat_agent_one_as_release_store:
2925; GFX7:       ; %bb.0: ; %entry
2926; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
2927; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
2928; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2929; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2930; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2931; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2932; GFX7-NEXT:    s_waitcnt vmcnt(0)
2933; GFX7-NEXT:    flat_store_dword v[0:1], v2
2934; GFX7-NEXT:    s_endpgm
2935;
2936; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
2937; GFX10-WGP:       ; %bb.0: ; %entry
2938; GFX10-WGP-NEXT:    s_clause 0x1
2939; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2940; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
2941; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2943; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2944; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
2945; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2946; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2947; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
2948; GFX10-WGP-NEXT:    s_endpgm
2949;
2950; GFX10-CU-LABEL: flat_agent_one_as_release_store:
2951; GFX10-CU:       ; %bb.0: ; %entry
2952; GFX10-CU-NEXT:    s_clause 0x1
2953; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2954; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
2955; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2956; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2957; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2958; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
2959; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2960; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2961; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
2962; GFX10-CU-NEXT:    s_endpgm
2963;
2964; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store:
2965; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2966; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
2967; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2968; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2969; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
2970; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2971; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2972; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2973; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
2974; SKIP-CACHE-INV-NEXT:    s_endpgm
2975    i32 %in, i32* %out) {
2976entry:
2977  store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4
2978  ret void
2979}
2980
; Sequentially consistent atomic store of %in through %out at syncscope
; "agent-one-as". The checks expect an s_waitcnt vmcnt(0) immediately ahead
; of the flat_store_dword, and on the two gfx1010 configurations an extra
; s_waitcnt_vscnt null, 0x0 between that wait and the store. Nothing is
; expected after the store other than s_endpgm.
2981define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
2982; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
2983; GFX7:       ; %bb.0: ; %entry
2984; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
2985; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
2986; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2988; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2989; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2990; GFX7-NEXT:    s_waitcnt vmcnt(0)
2991; GFX7-NEXT:    flat_store_dword v[0:1], v2
2992; GFX7-NEXT:    s_endpgm
2993;
2994; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
2995; GFX10-WGP:       ; %bb.0: ; %entry
2996; GFX10-WGP-NEXT:    s_clause 0x1
2997; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2998; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
2999; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3000; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3001; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3002; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3003; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3004; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3005; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3006; GFX10-WGP-NEXT:    s_endpgm
3007;
3008; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
3009; GFX10-CU:       ; %bb.0: ; %entry
3010; GFX10-CU-NEXT:    s_clause 0x1
3011; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3012; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3013; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3014; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3015; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3016; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3017; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3018; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3019; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3020; GFX10-CU-NEXT:    s_endpgm
3021;
3022; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store:
3023; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3024; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9
3025; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3026; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3027; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
3028; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3029; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3030; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3031; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3032; SKIP-CACHE-INV-NEXT:    s_endpgm
3033    i32 %in, i32* %out) {
3034entry:
3035  store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4
3036  ret void
3037}
3038
; Monotonic volatile atomicrmw xchg on %out with %in at syncscope
; "agent-one-as"; the result %val is never used. All four check prefixes
; expect the flat_atomic_swap form without a destination register or glc
; modifier, and no wait or cache instruction between it and s_endpgm.
3039define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
3040; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
3041; GFX7:       ; %bb.0: ; %entry
3042; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3043; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3044; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3045; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3046; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3047; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3048; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3049; GFX7-NEXT:    s_endpgm
3050;
3051; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
3052; GFX10-WGP:       ; %bb.0: ; %entry
3053; GFX10-WGP-NEXT:    s_clause 0x1
3054; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3055; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3056; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3057; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3058; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3059; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3060; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3061; GFX10-WGP-NEXT:    s_endpgm
3062;
3063; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
3064; GFX10-CU:       ; %bb.0: ; %entry
3065; GFX10-CU-NEXT:    s_clause 0x1
3066; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3067; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3068; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3069; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3070; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3071; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3072; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3073; GFX10-CU-NEXT:    s_endpgm
3074;
3075; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_atomicrmw:
3076; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3077; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3078; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3079; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3080; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3081; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3082; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3083; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3084; SKIP-CACHE-INV-NEXT:    s_endpgm
3085    i32* %out, i32 %in) {
3086entry:
3087  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic
3088  ret void
3089}
3090
; Acquire volatile atomicrmw xchg on %out with %in at syncscope
; "agent-one-as"; the result %val is never used. Expected code after the
; flat_atomic_swap differs per configuration:
;   - GFX7 prefix: s_waitcnt vmcnt(0) followed by buffer_wbinvl1_vol.
;   - both gfx1010 prefixes: s_waitcnt_vscnt null, 0x0 followed by
;     buffer_gl0_inv and buffer_gl1_inv.
;   - SKIP-CACHE-INV prefix (llc invoked with
;     -amdgcn-skip-cache-invalidations): only s_waitcnt vmcnt(0), with no
;     cache invalidate instruction.
3091define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
3092; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
3093; GFX7:       ; %bb.0: ; %entry
3094; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3095; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3096; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3097; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3098; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3099; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3100; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3101; GFX7-NEXT:    s_waitcnt vmcnt(0)
3102; GFX7-NEXT:    buffer_wbinvl1_vol
3103; GFX7-NEXT:    s_endpgm
3104;
3105; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
3106; GFX10-WGP:       ; %bb.0: ; %entry
3107; GFX10-WGP-NEXT:    s_clause 0x1
3108; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3109; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3110; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3111; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3112; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3113; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3114; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3115; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3116; GFX10-WGP-NEXT:    buffer_gl0_inv
3117; GFX10-WGP-NEXT:    buffer_gl1_inv
3118; GFX10-WGP-NEXT:    s_endpgm
3119;
3120; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
3121; GFX10-CU:       ; %bb.0: ; %entry
3122; GFX10-CU-NEXT:    s_clause 0x1
3123; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3124; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3125; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3126; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3127; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3128; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3129; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3130; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3131; GFX10-CU-NEXT:    buffer_gl0_inv
3132; GFX10-CU-NEXT:    buffer_gl1_inv
3133; GFX10-CU-NEXT:    s_endpgm
3134;
3135; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw:
3136; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3137; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3138; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3139; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3140; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3141; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3142; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3143; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3144; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3145; SKIP-CACHE-INV-NEXT:    s_endpgm
3146    i32* %out, i32 %in) {
3147entry:
3148  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
3149  ret void
3150}
3151
; Release volatile atomicrmw xchg on %out with %in at syncscope
; "agent-one-as"; the result %val is never used. The checks expect an
; s_waitcnt vmcnt(0) ahead of the flat_atomic_swap, plus
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 configurations, and nothing
; between the swap and s_endpgm.
3152define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
3153; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
3154; GFX7:       ; %bb.0: ; %entry
3155; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3156; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3157; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3158; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3159; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3160; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3161; GFX7-NEXT:    s_waitcnt vmcnt(0)
3162; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3163; GFX7-NEXT:    s_endpgm
3164;
3165; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
3166; GFX10-WGP:       ; %bb.0: ; %entry
3167; GFX10-WGP-NEXT:    s_clause 0x1
3168; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3169; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3170; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3171; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3172; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3173; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3174; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3175; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3176; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3177; GFX10-WGP-NEXT:    s_endpgm
3178;
3179; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
3180; GFX10-CU:       ; %bb.0: ; %entry
3181; GFX10-CU-NEXT:    s_clause 0x1
3182; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3183; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3184; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3185; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3186; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3187; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3188; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3189; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3190; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3191; GFX10-CU-NEXT:    s_endpgm
3192;
3193; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_atomicrmw:
3194; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3195; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3196; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3197; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3198; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3199; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3200; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3201; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3202; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3203; SKIP-CACHE-INV-NEXT:    s_endpgm
3204    i32* %out, i32 %in) {
3205entry:
3206  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release
3207  ret void
3208}
3209
; Acquire-release volatile atomicrmw xchg on %out with %in at syncscope
; "agent-one-as"; the result %val is never used. The checks expect waits on
; both sides of the flat_atomic_swap:
;   - before: s_waitcnt vmcnt(0), plus s_waitcnt_vscnt null, 0x0 on the two
;     gfx1010 configurations.
;   - after, GFX7 prefix: s_waitcnt vmcnt(0) then buffer_wbinvl1_vol.
;   - after, gfx1010 prefixes: s_waitcnt_vscnt null, 0x0 then buffer_gl0_inv
;     and buffer_gl1_inv.
;   - after, SKIP-CACHE-INV prefix: s_waitcnt vmcnt(0) only, with no cache
;     invalidate instruction.
3210define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
3211; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
3212; GFX7:       ; %bb.0: ; %entry
3213; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3214; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3215; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3216; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3217; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3218; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3219; GFX7-NEXT:    s_waitcnt vmcnt(0)
3220; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3221; GFX7-NEXT:    s_waitcnt vmcnt(0)
3222; GFX7-NEXT:    buffer_wbinvl1_vol
3223; GFX7-NEXT:    s_endpgm
3224;
3225; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
3226; GFX10-WGP:       ; %bb.0: ; %entry
3227; GFX10-WGP-NEXT:    s_clause 0x1
3228; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3229; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3230; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3231; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3232; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3233; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3234; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3235; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3236; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3237; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3238; GFX10-WGP-NEXT:    buffer_gl0_inv
3239; GFX10-WGP-NEXT:    buffer_gl1_inv
3240; GFX10-WGP-NEXT:    s_endpgm
3241;
3242; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
3243; GFX10-CU:       ; %bb.0: ; %entry
3244; GFX10-CU-NEXT:    s_clause 0x1
3245; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3246; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3247; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3248; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3249; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3250; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3251; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3252; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3253; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3254; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3255; GFX10-CU-NEXT:    buffer_gl0_inv
3256; GFX10-CU-NEXT:    buffer_gl1_inv
3257; GFX10-CU-NEXT:    s_endpgm
3258;
3259; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
3260; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3261; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3262; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3263; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3264; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3265; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3266; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3267; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3268; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3269; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3270; SKIP-CACHE-INV-NEXT:    s_endpgm
3271    i32* %out, i32 %in) {
3272entry:
3273  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
3274  ret void
3275}
3276
; Sequentially consistent volatile atomicrmw xchg on %out with %in at
; syncscope "agent-one-as"; the result %val is never used. The checks expect
; waits on both sides of the flat_atomic_swap:
;   - before: s_waitcnt vmcnt(0), plus s_waitcnt_vscnt null, 0x0 on the two
;     gfx1010 configurations.
;   - after, GFX7 prefix: s_waitcnt vmcnt(0) then buffer_wbinvl1_vol.
;   - after, gfx1010 prefixes: s_waitcnt_vscnt null, 0x0 then buffer_gl0_inv
;     and buffer_gl1_inv.
;   - after, SKIP-CACHE-INV prefix: s_waitcnt vmcnt(0) only, with no cache
;     invalidate instruction.
3277define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
3278; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
3279; GFX7:       ; %bb.0: ; %entry
3280; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3281; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3282; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3283; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3284; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3285; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3286; GFX7-NEXT:    s_waitcnt vmcnt(0)
3287; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3288; GFX7-NEXT:    s_waitcnt vmcnt(0)
3289; GFX7-NEXT:    buffer_wbinvl1_vol
3290; GFX7-NEXT:    s_endpgm
3291;
3292; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
3293; GFX10-WGP:       ; %bb.0: ; %entry
3294; GFX10-WGP-NEXT:    s_clause 0x1
3295; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3296; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3297; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3298; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3299; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3300; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3301; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3302; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3303; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
3304; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3305; GFX10-WGP-NEXT:    buffer_gl0_inv
3306; GFX10-WGP-NEXT:    buffer_gl1_inv
3307; GFX10-WGP-NEXT:    s_endpgm
3308;
3309; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
3310; GFX10-CU:       ; %bb.0: ; %entry
3311; GFX10-CU-NEXT:    s_clause 0x1
3312; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3313; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3314; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3315; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3316; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3317; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3318; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3319; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3320; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
3321; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3322; GFX10-CU-NEXT:    buffer_gl0_inv
3323; GFX10-CU-NEXT:    buffer_gl1_inv
3324; GFX10-CU-NEXT:    s_endpgm
3325;
3326; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
3327; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3328; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3329; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3330; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3332; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3333; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3334; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3335; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
3336; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3337; SKIP-CACHE-INV-NEXT:    s_endpgm
3338    i32* %out, i32 %in) {
3339entry:
3340  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
3341  ret void
3342}
3343
; Acquire volatile atomicrmw xchg on %out with %in at syncscope
; "agent-one-as" whose result %val is consumed: it is written back to %out
; with an ordinary (non-atomic) store. Because the result is used, the checks
; expect the returning form "flat_atomic_swap v2, v[0:1], v2 glc".
; Expected code between the swap and the flat_store_dword:
;   - GFX7 prefix: s_waitcnt vmcnt(0), buffer_wbinvl1_vol,
;     s_waitcnt lgkmcnt(0).
;   - both gfx1010 prefixes: s_waitcnt vmcnt(0), buffer_gl0_inv,
;     buffer_gl1_inv, s_waitcnt lgkmcnt(0).
;   - SKIP-CACHE-INV prefix: a single s_waitcnt vmcnt(0) lgkmcnt(0) and no
;     cache invalidate instruction.
3344define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
3345; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
3346; GFX7:       ; %bb.0: ; %entry
3347; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3348; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3349; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3350; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3351; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3352; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3353; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3354; GFX7-NEXT:    s_waitcnt vmcnt(0)
3355; GFX7-NEXT:    buffer_wbinvl1_vol
3356; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3357; GFX7-NEXT:    flat_store_dword v[0:1], v2
3358; GFX7-NEXT:    s_endpgm
3359;
3360; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
3361; GFX10-WGP:       ; %bb.0: ; %entry
3362; GFX10-WGP-NEXT:    s_clause 0x1
3363; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3364; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3365; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3366; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3367; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3368; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3369; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3370; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3371; GFX10-WGP-NEXT:    buffer_gl0_inv
3372; GFX10-WGP-NEXT:    buffer_gl1_inv
3373; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3374; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3375; GFX10-WGP-NEXT:    s_endpgm
3376;
3377; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
3378; GFX10-CU:       ; %bb.0: ; %entry
3379; GFX10-CU-NEXT:    s_clause 0x1
3380; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3381; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3382; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3383; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3384; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3385; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3386; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3387; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3388; GFX10-CU-NEXT:    buffer_gl0_inv
3389; GFX10-CU-NEXT:    buffer_gl1_inv
3390; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3391; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3392; GFX10-CU-NEXT:    s_endpgm
3393;
3394; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
3395; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3396; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3397; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3398; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3399; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3400; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3401; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3402; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3403; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3404; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3405; SKIP-CACHE-INV-NEXT:    s_endpgm
3406    i32* %out, i32 %in) {
3407entry:
3408  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
3409  store i32 %val, i32* %out, align 4
3410  ret void
3411}
3412
; Acquire-release volatile atomicrmw xchg on %out with %in at syncscope
; "agent-one-as" whose result %val is consumed: it is written back to %out
; with an ordinary (non-atomic) store. Because the result is used, the checks
; expect the returning form "flat_atomic_swap v2, v[0:1], v2 glc".
; Expected code before the swap: s_waitcnt vmcnt(0), plus
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 configurations.
; Expected code between the swap and the flat_store_dword:
;   - GFX7 prefix: s_waitcnt vmcnt(0), buffer_wbinvl1_vol,
;     s_waitcnt lgkmcnt(0).
;   - both gfx1010 prefixes: s_waitcnt vmcnt(0), buffer_gl0_inv,
;     buffer_gl1_inv, s_waitcnt lgkmcnt(0).
;   - SKIP-CACHE-INV prefix: a single s_waitcnt vmcnt(0) lgkmcnt(0) and no
;     cache invalidate instruction.
3413define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
3414; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
3415; GFX7:       ; %bb.0: ; %entry
3416; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3417; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3418; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3419; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3420; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3421; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3422; GFX7-NEXT:    s_waitcnt vmcnt(0)
3423; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3424; GFX7-NEXT:    s_waitcnt vmcnt(0)
3425; GFX7-NEXT:    buffer_wbinvl1_vol
3426; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3427; GFX7-NEXT:    flat_store_dword v[0:1], v2
3428; GFX7-NEXT:    s_endpgm
3429;
3430; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
3431; GFX10-WGP:       ; %bb.0: ; %entry
3432; GFX10-WGP-NEXT:    s_clause 0x1
3433; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3434; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3435; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3436; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3437; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3438; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3439; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3440; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3441; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3442; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3443; GFX10-WGP-NEXT:    buffer_gl0_inv
3444; GFX10-WGP-NEXT:    buffer_gl1_inv
3445; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3446; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3447; GFX10-WGP-NEXT:    s_endpgm
3448;
3449; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
3450; GFX10-CU:       ; %bb.0: ; %entry
3451; GFX10-CU-NEXT:    s_clause 0x1
3452; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3453; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3454; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3455; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3456; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3457; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3458; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3459; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3460; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3461; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3462; GFX10-CU-NEXT:    buffer_gl0_inv
3463; GFX10-CU-NEXT:    buffer_gl1_inv
3464; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3465; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3466; GFX10-CU-NEXT:    s_endpgm
3467;
3468; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
3469; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3470; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3471; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3472; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3473; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3474; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3475; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3476; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3477; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3478; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3479; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3480; SKIP-CACHE-INV-NEXT:    s_endpgm
3481    i32* %out, i32 %in) {
3482entry:
3483  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
3484  store i32 %val, i32* %out, align 4
3485  ret void
3486}
3487
; Sequentially consistent volatile atomicrmw xchg on %out with %in at
; syncscope "agent-one-as" whose result %val is consumed: it is written back
; to %out with an ordinary (non-atomic) store. Because the result is used,
; the checks expect the returning form "flat_atomic_swap v2, v[0:1], v2 glc".
; Expected code before the swap: s_waitcnt vmcnt(0), plus
; s_waitcnt_vscnt null, 0x0 on the two gfx1010 configurations.
; Expected code between the swap and the flat_store_dword:
;   - GFX7 prefix: s_waitcnt vmcnt(0), buffer_wbinvl1_vol,
;     s_waitcnt lgkmcnt(0).
;   - both gfx1010 prefixes: s_waitcnt vmcnt(0), buffer_gl0_inv,
;     buffer_gl1_inv, s_waitcnt lgkmcnt(0).
;   - SKIP-CACHE-INV prefix: a single s_waitcnt vmcnt(0) lgkmcnt(0) and no
;     cache invalidate instruction.
3488define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
3489; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
3490; GFX7:       ; %bb.0: ; %entry
3491; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3492; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3493; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3494; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3495; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3496; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3497; GFX7-NEXT:    s_waitcnt vmcnt(0)
3498; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3499; GFX7-NEXT:    s_waitcnt vmcnt(0)
3500; GFX7-NEXT:    buffer_wbinvl1_vol
3501; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3502; GFX7-NEXT:    flat_store_dword v[0:1], v2
3503; GFX7-NEXT:    s_endpgm
3504;
3505; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
3506; GFX10-WGP:       ; %bb.0: ; %entry
3507; GFX10-WGP-NEXT:    s_clause 0x1
3508; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3509; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3510; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3511; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3512; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3513; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3514; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3515; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3516; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3517; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3518; GFX10-WGP-NEXT:    buffer_gl0_inv
3519; GFX10-WGP-NEXT:    buffer_gl1_inv
3520; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3521; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
3522; GFX10-WGP-NEXT:    s_endpgm
3523;
3524; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
3525; GFX10-CU:       ; %bb.0: ; %entry
3526; GFX10-CU-NEXT:    s_clause 0x1
3527; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3528; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3529; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3530; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3531; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3532; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3533; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3534; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3535; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3536; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3537; GFX10-CU-NEXT:    buffer_gl0_inv
3538; GFX10-CU-NEXT:    buffer_gl1_inv
3539; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3540; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
3541; GFX10-CU-NEXT:    s_endpgm
3542;
3543; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
3544; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3545; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3546; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3547; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3548; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3549; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3550; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3551; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3552; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3553; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3554; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
3555; SKIP-CACHE-INV-NEXT:    s_endpgm
3556    i32* %out, i32 %in) {
3557entry:
3558  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
3559  store i32 %val, i32* %out, align 4
3560  ret void
3561}
3562
; Volatile cmpxchg at syncscope "agent-one-as" with monotonic success and
; monotonic failure orderings; the result %val is never used. The target
; address is %out advanced by four i32 elements (%gep), which the checks
; expect as an s_add_u32 of 16 with an s_addc_u32 carry into the high half.
; All four check prefixes expect flat_atomic_cmpswap v[0:1], v[2:3] without
; a destination register, and no wait or cache instruction between it and
; s_endpgm.
3563define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
3564; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
3565; GFX7:       ; %bb.0: ; %entry
3566; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3567; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3568; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3569; GFX7-NEXT:    s_add_u32 s0, s0, 16
3570; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3571; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3572; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3573; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3574; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3575; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3576; GFX7-NEXT:    s_endpgm
3577;
3578; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
3579; GFX10-WGP:       ; %bb.0: ; %entry
3580; GFX10-WGP-NEXT:    s_clause 0x1
3581; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3582; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3583; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3584; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3585; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3586; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3587; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3588; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3589; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3590; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3591; GFX10-WGP-NEXT:    s_endpgm
3592;
3593; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
3594; GFX10-CU:       ; %bb.0: ; %entry
3595; GFX10-CU-NEXT:    s_clause 0x1
3596; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3597; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3598; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3599; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3600; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3601; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3602; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3603; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3604; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3605; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3606; GFX10-CU-NEXT:    s_endpgm
3607;
3608; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
3609; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3610; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3611; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3612; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3613; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3614; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3615; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3616; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3617; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3618; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3619; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3620; SKIP-CACHE-INV-NEXT:    s_endpgm
3621    i32* %out, i32 %in, i32 %old) {
3622entry:
3623  %gep = getelementptr i32, i32* %out, i32 4
3624  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
3625  ret void
3626}
3627
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering acquire and
; failure ordering monotonic.
; Expected code after flat_atomic_cmpswap (nothing extra is expected before it):
;   gfx700 amdhsa run        - s_waitcnt vmcnt(0), then buffer_wbinvl1_vol
;   gfx1010 runs (with and   - s_waitcnt_vscnt null, 0x0, then buffer_gl0_inv
;     without +cumode)         and buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) only, no buffer_wbinvl1_vol
3628define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
3629; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
3630; GFX7:       ; %bb.0: ; %entry
3631; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3632; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3633; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3634; GFX7-NEXT:    s_add_u32 s0, s0, 16
3635; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3636; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3637; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3638; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3639; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3640; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3641; GFX7-NEXT:    s_waitcnt vmcnt(0)
3642; GFX7-NEXT:    buffer_wbinvl1_vol
3643; GFX7-NEXT:    s_endpgm
3644;
3645; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
3646; GFX10-WGP:       ; %bb.0: ; %entry
3647; GFX10-WGP-NEXT:    s_clause 0x1
3648; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3649; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3650; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3651; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3652; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3653; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3654; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3655; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3656; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3657; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3658; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3659; GFX10-WGP-NEXT:    buffer_gl0_inv
3660; GFX10-WGP-NEXT:    buffer_gl1_inv
3661; GFX10-WGP-NEXT:    s_endpgm
3662;
3663; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
3664; GFX10-CU:       ; %bb.0: ; %entry
3665; GFX10-CU-NEXT:    s_clause 0x1
3666; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3667; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3668; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3669; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3670; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3671; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3672; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3673; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3674; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3675; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3676; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3677; GFX10-CU-NEXT:    buffer_gl0_inv
3678; GFX10-CU-NEXT:    buffer_gl1_inv
3679; GFX10-CU-NEXT:    s_endpgm
3680;
3681; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
3682; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3683; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3684; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3685; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3686; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3687; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3688; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3689; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3690; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3691; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3692; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3693; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3694; SKIP-CACHE-INV-NEXT:    s_endpgm
3695    i32* %out, i32 %in, i32 %old) {
3696entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
3697  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
3698  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
3699  ret void
3700}
3701
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering release and
; failure ordering monotonic.
; Expected code immediately before flat_atomic_cmpswap (nothing is expected
; between the atomic and s_endpgm):
;   gfx700 amdhsa run        - s_waitcnt vmcnt(0)
;   gfx1010 runs (with and   - s_waitcnt vmcnt(0), then
;     without +cumode)         s_waitcnt_vscnt null, 0x0
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0)
3702define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
3703; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
3704; GFX7:       ; %bb.0: ; %entry
3705; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3706; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3707; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3708; GFX7-NEXT:    s_add_u32 s0, s0, 16
3709; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3710; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3711; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3712; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3713; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3714; GFX7-NEXT:    s_waitcnt vmcnt(0)
3715; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3716; GFX7-NEXT:    s_endpgm
3717;
3718; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
3719; GFX10-WGP:       ; %bb.0: ; %entry
3720; GFX10-WGP-NEXT:    s_clause 0x1
3721; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3722; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3723; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3724; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3725; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3726; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3727; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3728; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3729; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3730; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3731; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3732; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3733; GFX10-WGP-NEXT:    s_endpgm
3734;
3735; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
3736; GFX10-CU:       ; %bb.0: ; %entry
3737; GFX10-CU-NEXT:    s_clause 0x1
3738; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3739; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3740; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3741; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3742; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3743; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3744; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3745; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3746; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3747; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3748; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3749; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3750; GFX10-CU-NEXT:    s_endpgm
3751;
3752; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
3753; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3754; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3755; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3756; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3757; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3758; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3759; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3760; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3761; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3762; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3763; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3764; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3765; SKIP-CACHE-INV-NEXT:    s_endpgm
3766    i32* %out, i32 %in, i32 %old) {
3767entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
3768  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
3769  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
3770  ret void
3771}
3772
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering acq_rel and
; failure ordering monotonic.
; Expected code around flat_atomic_cmpswap:
;   gfx700 amdhsa run        - before: s_waitcnt vmcnt(0)
;                              after:  s_waitcnt vmcnt(0), buffer_wbinvl1_vol
;   gfx1010 runs (with and   - before: s_waitcnt vmcnt(0),
;     without +cumode)                 s_waitcnt_vscnt null, 0x0
;                              after:  s_waitcnt_vscnt null, 0x0,
;                                      buffer_gl0_inv, buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) before and after, and no
;                              buffer_wbinvl1_vol
3773define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
3774; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
3775; GFX7:       ; %bb.0: ; %entry
3776; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3777; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3778; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3779; GFX7-NEXT:    s_add_u32 s0, s0, 16
3780; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3781; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3782; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3783; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3784; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3785; GFX7-NEXT:    s_waitcnt vmcnt(0)
3786; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3787; GFX7-NEXT:    s_waitcnt vmcnt(0)
3788; GFX7-NEXT:    buffer_wbinvl1_vol
3789; GFX7-NEXT:    s_endpgm
3790;
3791; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
3792; GFX10-WGP:       ; %bb.0: ; %entry
3793; GFX10-WGP-NEXT:    s_clause 0x1
3794; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3795; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3796; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3797; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3798; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3799; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3800; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3801; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3802; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3803; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3804; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3805; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3806; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3807; GFX10-WGP-NEXT:    buffer_gl0_inv
3808; GFX10-WGP-NEXT:    buffer_gl1_inv
3809; GFX10-WGP-NEXT:    s_endpgm
3810;
3811; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
3812; GFX10-CU:       ; %bb.0: ; %entry
3813; GFX10-CU-NEXT:    s_clause 0x1
3814; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3815; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3816; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3817; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3818; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3819; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3820; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3821; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3822; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3823; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3824; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3825; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3826; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3827; GFX10-CU-NEXT:    buffer_gl0_inv
3828; GFX10-CU-NEXT:    buffer_gl1_inv
3829; GFX10-CU-NEXT:    s_endpgm
3830;
3831; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
3832; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3833; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3834; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3835; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3836; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3837; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3838; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3839; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3840; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3841; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3842; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3843; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3844; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3845; SKIP-CACHE-INV-NEXT:    s_endpgm
3846    i32* %out, i32 %in, i32 %old) {
3847entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
3848  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
3849  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
3850  ret void
3851}
3852
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering seq_cst and
; failure ordering monotonic.
; Expected code around flat_atomic_cmpswap:
;   gfx700 amdhsa run        - before: s_waitcnt vmcnt(0)
;                              after:  s_waitcnt vmcnt(0), buffer_wbinvl1_vol
;   gfx1010 runs (with and   - before: s_waitcnt vmcnt(0),
;     without +cumode)                 s_waitcnt_vscnt null, 0x0
;                              after:  s_waitcnt_vscnt null, 0x0,
;                                      buffer_gl0_inv, buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) before and after, and no
;                              buffer_wbinvl1_vol
3853define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
3854; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
3855; GFX7:       ; %bb.0: ; %entry
3856; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3857; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3858; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3859; GFX7-NEXT:    s_add_u32 s0, s0, 16
3860; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3861; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3862; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3863; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3864; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3865; GFX7-NEXT:    s_waitcnt vmcnt(0)
3866; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3867; GFX7-NEXT:    s_waitcnt vmcnt(0)
3868; GFX7-NEXT:    buffer_wbinvl1_vol
3869; GFX7-NEXT:    s_endpgm
3870;
3871; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
3872; GFX10-WGP:       ; %bb.0: ; %entry
3873; GFX10-WGP-NEXT:    s_clause 0x1
3874; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3875; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3876; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3877; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3878; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3879; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3880; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3881; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3882; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3883; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3884; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3885; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3886; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3887; GFX10-WGP-NEXT:    buffer_gl0_inv
3888; GFX10-WGP-NEXT:    buffer_gl1_inv
3889; GFX10-WGP-NEXT:    s_endpgm
3890;
3891; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
3892; GFX10-CU:       ; %bb.0: ; %entry
3893; GFX10-CU-NEXT:    s_clause 0x1
3894; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3895; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3896; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3897; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3898; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3899; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3900; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3901; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3902; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3903; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3904; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3905; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3906; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3907; GFX10-CU-NEXT:    buffer_gl0_inv
3908; GFX10-CU-NEXT:    buffer_gl1_inv
3909; GFX10-CU-NEXT:    s_endpgm
3910;
3911; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
3912; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3913; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3914; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3915; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3916; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3917; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3918; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3919; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3920; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3921; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3922; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3923; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3924; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3925; SKIP-CACHE-INV-NEXT:    s_endpgm
3926    i32* %out, i32 %in, i32 %old) {
3927entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
3928  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
3929  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
3930  ret void
3931}
3932
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering acquire and
; failure ordering acquire.
; Expected code after flat_atomic_cmpswap (nothing extra is expected before it):
;   gfx700 amdhsa run        - s_waitcnt vmcnt(0), then buffer_wbinvl1_vol
;   gfx1010 runs (with and   - s_waitcnt_vscnt null, 0x0, then buffer_gl0_inv
;     without +cumode)         and buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) only, no buffer_wbinvl1_vol
3933define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
3934; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
3935; GFX7:       ; %bb.0: ; %entry
3936; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3937; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3938; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3939; GFX7-NEXT:    s_add_u32 s0, s0, 16
3940; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3941; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3942; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3943; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3944; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3945; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3946; GFX7-NEXT:    s_waitcnt vmcnt(0)
3947; GFX7-NEXT:    buffer_wbinvl1_vol
3948; GFX7-NEXT:    s_endpgm
3949;
3950; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
3951; GFX10-WGP:       ; %bb.0: ; %entry
3952; GFX10-WGP-NEXT:    s_clause 0x1
3953; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3954; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3955; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3956; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
3957; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
3958; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3959; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
3960; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3961; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
3962; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3963; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3964; GFX10-WGP-NEXT:    buffer_gl0_inv
3965; GFX10-WGP-NEXT:    buffer_gl1_inv
3966; GFX10-WGP-NEXT:    s_endpgm
3967;
3968; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
3969; GFX10-CU:       ; %bb.0: ; %entry
3970; GFX10-CU-NEXT:    s_clause 0x1
3971; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3972; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
3973; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3974; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
3975; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
3976; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3977; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
3978; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3979; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
3980; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3981; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3982; GFX10-CU-NEXT:    buffer_gl0_inv
3983; GFX10-CU-NEXT:    buffer_gl1_inv
3984; GFX10-CU-NEXT:    s_endpgm
3985;
3986; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
3987; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3988; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
3989; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3990; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3991; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
3992; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
3993; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
3994; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
3995; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
3996; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
3997; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3998; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3999; SKIP-CACHE-INV-NEXT:    s_endpgm
4000    i32* %out, i32 %in, i32 %old) {
4001entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
4002  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
4003  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
4004  ret void
4005}
4006
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering release and
; failure ordering acquire.
; Expected code around flat_atomic_cmpswap (the checks expect instructions on
; both sides of the atomic for this ordering pair):
;   gfx700 amdhsa run        - before: s_waitcnt vmcnt(0)
;                              after:  s_waitcnt vmcnt(0), buffer_wbinvl1_vol
;   gfx1010 runs (with and   - before: s_waitcnt vmcnt(0),
;     without +cumode)                 s_waitcnt_vscnt null, 0x0
;                              after:  s_waitcnt_vscnt null, 0x0,
;                                      buffer_gl0_inv, buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) before and after, and no
;                              buffer_wbinvl1_vol
4007define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
4008; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
4009; GFX7:       ; %bb.0: ; %entry
4010; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4011; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4012; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4013; GFX7-NEXT:    s_add_u32 s0, s0, 16
4014; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4015; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4016; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4017; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4018; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4019; GFX7-NEXT:    s_waitcnt vmcnt(0)
4020; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4021; GFX7-NEXT:    s_waitcnt vmcnt(0)
4022; GFX7-NEXT:    buffer_wbinvl1_vol
4023; GFX7-NEXT:    s_endpgm
4024;
4025; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
4026; GFX10-WGP:       ; %bb.0: ; %entry
4027; GFX10-WGP-NEXT:    s_clause 0x1
4028; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4029; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4030; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4031; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4032; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4033; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4034; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4035; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4036; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4037; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4038; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4039; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4040; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4041; GFX10-WGP-NEXT:    buffer_gl0_inv
4042; GFX10-WGP-NEXT:    buffer_gl1_inv
4043; GFX10-WGP-NEXT:    s_endpgm
4044;
4045; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
4046; GFX10-CU:       ; %bb.0: ; %entry
4047; GFX10-CU-NEXT:    s_clause 0x1
4048; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4049; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4050; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4051; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4052; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4053; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4054; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4055; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4056; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4057; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4058; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4059; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4060; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4061; GFX10-CU-NEXT:    buffer_gl0_inv
4062; GFX10-CU-NEXT:    buffer_gl1_inv
4063; GFX10-CU-NEXT:    s_endpgm
4064;
4065; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
4066; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4067; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4068; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4069; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4070; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4071; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4072; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4073; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4074; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4075; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4076; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4077; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4078; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4079; SKIP-CACHE-INV-NEXT:    s_endpgm
4080    i32* %out, i32 %in, i32 %old) {
4081entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
4082  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
4083  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
4084  ret void
4085}
4086
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering acq_rel and
; failure ordering acquire.
; Expected code around flat_atomic_cmpswap:
;   gfx700 amdhsa run        - before: s_waitcnt vmcnt(0)
;                              after:  s_waitcnt vmcnt(0), buffer_wbinvl1_vol
;   gfx1010 runs (with and   - before: s_waitcnt vmcnt(0),
;     without +cumode)                 s_waitcnt_vscnt null, 0x0
;                              after:  s_waitcnt_vscnt null, 0x0,
;                                      buffer_gl0_inv, buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) before and after, and no
;                              buffer_wbinvl1_vol
4087define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
4088; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
4089; GFX7:       ; %bb.0: ; %entry
4090; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4091; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4092; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4093; GFX7-NEXT:    s_add_u32 s0, s0, 16
4094; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4095; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4096; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4097; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4098; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4099; GFX7-NEXT:    s_waitcnt vmcnt(0)
4100; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4101; GFX7-NEXT:    s_waitcnt vmcnt(0)
4102; GFX7-NEXT:    buffer_wbinvl1_vol
4103; GFX7-NEXT:    s_endpgm
4104;
4105; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
4106; GFX10-WGP:       ; %bb.0: ; %entry
4107; GFX10-WGP-NEXT:    s_clause 0x1
4108; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4109; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4110; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4111; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4112; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4113; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4114; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4115; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4116; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4117; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4118; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4119; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4120; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4121; GFX10-WGP-NEXT:    buffer_gl0_inv
4122; GFX10-WGP-NEXT:    buffer_gl1_inv
4123; GFX10-WGP-NEXT:    s_endpgm
4124;
4125; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
4126; GFX10-CU:       ; %bb.0: ; %entry
4127; GFX10-CU-NEXT:    s_clause 0x1
4128; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4129; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4130; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4131; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4132; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4133; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4134; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4135; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4136; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4137; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4138; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4139; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4140; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4141; GFX10-CU-NEXT:    buffer_gl0_inv
4142; GFX10-CU-NEXT:    buffer_gl1_inv
4143; GFX10-CU-NEXT:    s_endpgm
4144;
4145; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
4146; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4147; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4148; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4149; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4150; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4151; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4152; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4153; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4154; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4155; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4156; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4157; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4158; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4159; SKIP-CACHE-INV-NEXT:    s_endpgm
4160    i32* %out, i32 %in, i32 %old) {
4161entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
4162  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
4163  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
4164  ret void
4165}
4166
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering seq_cst and
; failure ordering acquire.
; Expected code around flat_atomic_cmpswap:
;   gfx700 amdhsa run        - before: s_waitcnt vmcnt(0)
;                              after:  s_waitcnt vmcnt(0), buffer_wbinvl1_vol
;   gfx1010 runs (with and   - before: s_waitcnt vmcnt(0),
;     without +cumode)                 s_waitcnt_vscnt null, 0x0
;                              after:  s_waitcnt_vscnt null, 0x0,
;                                      buffer_gl0_inv, buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) before and after, and no
;                              buffer_wbinvl1_vol
4167define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
4168; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
4169; GFX7:       ; %bb.0: ; %entry
4170; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4171; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4172; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4173; GFX7-NEXT:    s_add_u32 s0, s0, 16
4174; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4175; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4176; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4177; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4178; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4179; GFX7-NEXT:    s_waitcnt vmcnt(0)
4180; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4181; GFX7-NEXT:    s_waitcnt vmcnt(0)
4182; GFX7-NEXT:    buffer_wbinvl1_vol
4183; GFX7-NEXT:    s_endpgm
4184;
4185; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
4186; GFX10-WGP:       ; %bb.0: ; %entry
4187; GFX10-WGP-NEXT:    s_clause 0x1
4188; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4189; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4190; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4191; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4192; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4193; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4194; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4195; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4196; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4197; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4198; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4199; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4200; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4201; GFX10-WGP-NEXT:    buffer_gl0_inv
4202; GFX10-WGP-NEXT:    buffer_gl1_inv
4203; GFX10-WGP-NEXT:    s_endpgm
4204;
4205; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
4206; GFX10-CU:       ; %bb.0: ; %entry
4207; GFX10-CU-NEXT:    s_clause 0x1
4208; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4209; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4210; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4211; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4212; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4213; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4214; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4215; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4216; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4217; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4218; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4219; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4220; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4221; GFX10-CU-NEXT:    buffer_gl0_inv
4222; GFX10-CU-NEXT:    buffer_gl1_inv
4223; GFX10-CU-NEXT:    s_endpgm
4224;
4225; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
4226; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4227; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4228; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4229; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4230; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4231; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4232; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4233; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4234; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4235; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4236; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4237; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4238; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4239; SKIP-CACHE-INV-NEXT:    s_endpgm
4240    i32* %out, i32 %in, i32 %old) {
4241entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
4242  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
4243  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
4244  ret void
4245}
4246
; Non-returning cmpxchg through an i32* that carries no address-space
; qualifier, at syncscope("agent-one-as"), with success ordering seq_cst and
; failure ordering seq_cst.
; Expected code around flat_atomic_cmpswap:
;   gfx700 amdhsa run        - before: s_waitcnt vmcnt(0)
;                              after:  s_waitcnt vmcnt(0), buffer_wbinvl1_vol
;   gfx1010 runs (with and   - before: s_waitcnt vmcnt(0),
;     without +cumode)                 s_waitcnt_vscnt null, 0x0
;                              after:  s_waitcnt_vscnt null, 0x0,
;                                      buffer_gl0_inv, buffer_gl1_inv
;   amdpal run with -amdgcn-skip-cache-invalidations
;                            - s_waitcnt vmcnt(0) before and after, and no
;                              buffer_wbinvl1_vol
4247define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
4248; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
4249; GFX7:       ; %bb.0: ; %entry
4250; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4251; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4252; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4253; GFX7-NEXT:    s_add_u32 s0, s0, 16
4254; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4255; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4256; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4257; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4258; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4259; GFX7-NEXT:    s_waitcnt vmcnt(0)
4260; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4261; GFX7-NEXT:    s_waitcnt vmcnt(0)
4262; GFX7-NEXT:    buffer_wbinvl1_vol
4263; GFX7-NEXT:    s_endpgm
4264;
4265; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
4266; GFX10-WGP:       ; %bb.0: ; %entry
4267; GFX10-WGP-NEXT:    s_clause 0x1
4268; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4269; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4270; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4271; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
4272; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
4273; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4274; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4275; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4276; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4277; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4278; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4279; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4280; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4281; GFX10-WGP-NEXT:    buffer_gl0_inv
4282; GFX10-WGP-NEXT:    buffer_gl1_inv
4283; GFX10-WGP-NEXT:    s_endpgm
4284;
4285; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
4286; GFX10-CU:       ; %bb.0: ; %entry
4287; GFX10-CU-NEXT:    s_clause 0x1
4288; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4289; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4290; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4291; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
4292; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
4293; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4294; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4295; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4296; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4297; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4298; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4299; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4300; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4301; GFX10-CU-NEXT:    buffer_gl0_inv
4302; GFX10-CU-NEXT:    buffer_gl1_inv
4303; GFX10-CU-NEXT:    s_endpgm
4304;
4305; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
4306; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4307; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4308; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4309; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4310; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16
4311; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0
4312; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4313; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4314; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4315; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4316; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4317; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4318; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4319; SKIP-CACHE-INV-NEXT:    s_endpgm
4320    i32* %out, i32 %in, i32 %old) {
4321entry:
  ; Element 4 of %out, i.e. byte offset 16; this is the 64-bit add of 16
  ; (s_add_u32 / s_addc_u32) in the expected output above.
4322  %gep = getelementptr i32, i32* %out, i32 4
  ; %old is the compare value and %in is the new value. %val is never used.
4323  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
4324  ret void
4325}
4326
; Returning cmpxchg through a flat pointer at syncscope("agent-one-as") with
; success ordering acquire and failure ordering monotonic. The kernel operates
; on %out advanced by 4 i32 elements (16 bytes) and stores the value returned
; by the cmpxchg back to %out.
; Expected code: the atomic has a destination register and glc, with no wait
; in front of it. It is followed by s_waitcnt vmcnt(0) and a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The llc invocation that passes -amdgcn-skip-cache-invalidations keeps the
; wait but expects no invalidate instruction.
4327define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
4328; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
4329; GFX7:       ; %bb.0: ; %entry
4330; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4331; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4332; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4333; GFX7-NEXT:    s_add_u32 s4, s0, 16
4334; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4335; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4336; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4337; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4338; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4339; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4340; GFX7-NEXT:    s_waitcnt vmcnt(0)
4341; GFX7-NEXT:    buffer_wbinvl1_vol
4342; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4343; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4344; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4345; GFX7-NEXT:    flat_store_dword v[0:1], v2
4346; GFX7-NEXT:    s_endpgm
4347;
4348; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
4349; GFX10-WGP:       ; %bb.0: ; %entry
4350; GFX10-WGP-NEXT:    s_clause 0x1
4351; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4352; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4353; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4354; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4355; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4356; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4357; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4358; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4359; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4360; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4361; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4362; GFX10-WGP-NEXT:    buffer_gl0_inv
4363; GFX10-WGP-NEXT:    buffer_gl1_inv
4364; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4365; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4366; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4367; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4368; GFX10-WGP-NEXT:    s_endpgm
4369;
4370; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
4371; GFX10-CU:       ; %bb.0: ; %entry
4372; GFX10-CU-NEXT:    s_clause 0x1
4373; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4374; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4375; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4376; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4377; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4378; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4379; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4380; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4381; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4382; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4383; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4384; GFX10-CU-NEXT:    buffer_gl0_inv
4385; GFX10-CU-NEXT:    buffer_gl1_inv
4386; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4387; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4388; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4389; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4390; GFX10-CU-NEXT:    s_endpgm
4391;
4392; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
4393; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4394; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4395; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4396; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4397; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4398; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4399; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4400; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4401; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4402; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4403; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4404; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4405; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4406; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4407; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4408; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4409; SKIP-CACHE-INV-NEXT:    s_endpgm
4410    i32* %out, i32 %in, i32 %old) {
4411entry:
4412  %gep = getelementptr i32, i32* %out, i32 4
4413  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
  ; Element 0 of the { i32, i1 } result is the value read from memory.
4414  %val0 = extractvalue { i32, i1 } %val, 0
4415  store i32 %val0, i32* %out, align 4
4416  ret void
4417}
4418
; Returning cmpxchg through a flat pointer at syncscope("agent-one-as") with
; success ordering acq_rel and failure ordering monotonic. The kernel operates
; on %out advanced by 4 i32 elements (16 bytes) and stores the value returned
; by the cmpxchg back to %out.
; Expected code: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on
; gfx1010) precedes the atomic. The atomic has a destination register and glc,
; and is followed by s_waitcnt vmcnt(0) and a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The llc invocation that passes -amdgcn-skip-cache-invalidations keeps both
; waits but expects no invalidate instruction.
4419define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
4420; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
4421; GFX7:       ; %bb.0: ; %entry
4422; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4423; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4424; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4425; GFX7-NEXT:    s_add_u32 s4, s0, 16
4426; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4427; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4428; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4429; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4430; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4431; GFX7-NEXT:    s_waitcnt vmcnt(0)
4432; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4433; GFX7-NEXT:    s_waitcnt vmcnt(0)
4434; GFX7-NEXT:    buffer_wbinvl1_vol
4435; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4436; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4437; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4438; GFX7-NEXT:    flat_store_dword v[0:1], v2
4439; GFX7-NEXT:    s_endpgm
4440;
4441; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
4442; GFX10-WGP:       ; %bb.0: ; %entry
4443; GFX10-WGP-NEXT:    s_clause 0x1
4444; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4445; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4446; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4447; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4448; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4449; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4450; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4451; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4452; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4453; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4454; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4455; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4456; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4457; GFX10-WGP-NEXT:    buffer_gl0_inv
4458; GFX10-WGP-NEXT:    buffer_gl1_inv
4459; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4460; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4461; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4462; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4463; GFX10-WGP-NEXT:    s_endpgm
4464;
4465; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
4466; GFX10-CU:       ; %bb.0: ; %entry
4467; GFX10-CU-NEXT:    s_clause 0x1
4468; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4469; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4470; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4471; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4472; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4473; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4474; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4475; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4476; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4477; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4478; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4479; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4480; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4481; GFX10-CU-NEXT:    buffer_gl0_inv
4482; GFX10-CU-NEXT:    buffer_gl1_inv
4483; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4484; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4485; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4486; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4487; GFX10-CU-NEXT:    s_endpgm
4488;
4489; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
4490; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4491; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4492; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4493; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4494; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4495; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4497; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4498; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4499; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4500; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4501; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4502; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4503; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4504; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4505; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4506; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4507; SKIP-CACHE-INV-NEXT:    s_endpgm
4508    i32* %out, i32 %in, i32 %old) {
4509entry:
4510  %gep = getelementptr i32, i32* %out, i32 4
4511  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
  ; Element 0 of the { i32, i1 } result is the value read from memory.
4512  %val0 = extractvalue { i32, i1 } %val, 0
4513  store i32 %val0, i32* %out, align 4
4514  ret void
4515}
4516
; Returning cmpxchg through a flat pointer at syncscope("agent-one-as") with
; success ordering seq_cst and failure ordering monotonic. The kernel operates
; on %out advanced by 4 i32 elements (16 bytes) and stores the value returned
; by the cmpxchg back to %out.
; Expected code: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on
; gfx1010) precedes the atomic. The atomic has a destination register and glc,
; and is followed by s_waitcnt vmcnt(0) and a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The llc invocation that passes -amdgcn-skip-cache-invalidations keeps both
; waits but expects no invalidate instruction.
4517define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
4518; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
4519; GFX7:       ; %bb.0: ; %entry
4520; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4521; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4522; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4523; GFX7-NEXT:    s_add_u32 s4, s0, 16
4524; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4525; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4526; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4527; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4528; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4529; GFX7-NEXT:    s_waitcnt vmcnt(0)
4530; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4531; GFX7-NEXT:    s_waitcnt vmcnt(0)
4532; GFX7-NEXT:    buffer_wbinvl1_vol
4533; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4534; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4535; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4536; GFX7-NEXT:    flat_store_dword v[0:1], v2
4537; GFX7-NEXT:    s_endpgm
4538;
4539; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
4540; GFX10-WGP:       ; %bb.0: ; %entry
4541; GFX10-WGP-NEXT:    s_clause 0x1
4542; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4543; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4544; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4545; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4546; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4547; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4548; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4549; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4550; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4551; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4552; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4553; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4554; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4555; GFX10-WGP-NEXT:    buffer_gl0_inv
4556; GFX10-WGP-NEXT:    buffer_gl1_inv
4557; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4558; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4559; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4560; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4561; GFX10-WGP-NEXT:    s_endpgm
4562;
4563; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
4564; GFX10-CU:       ; %bb.0: ; %entry
4565; GFX10-CU-NEXT:    s_clause 0x1
4566; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4567; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4568; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4569; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4570; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4571; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4572; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4573; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4574; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4575; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4576; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4577; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4578; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4579; GFX10-CU-NEXT:    buffer_gl0_inv
4580; GFX10-CU-NEXT:    buffer_gl1_inv
4581; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4582; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4583; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4584; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4585; GFX10-CU-NEXT:    s_endpgm
4586;
4587; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
4588; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4589; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4590; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4591; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4592; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4593; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4594; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4595; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4596; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4597; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4598; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4599; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4600; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4601; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4602; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4603; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4604; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4605; SKIP-CACHE-INV-NEXT:    s_endpgm
4606    i32* %out, i32 %in, i32 %old) {
4607entry:
4608  %gep = getelementptr i32, i32* %out, i32 4
4609  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
  ; Element 0 of the { i32, i1 } result is the value read from memory.
4610  %val0 = extractvalue { i32, i1 } %val, 0
4611  store i32 %val0, i32* %out, align 4
4612  ret void
4613}
4614
; Returning cmpxchg through a flat pointer at syncscope("agent-one-as") with
; acquire ordering for both success and failure. The kernel operates on %out
; advanced by 4 i32 elements (16 bytes) and stores the value returned by the
; cmpxchg back to %out.
; Expected code: the atomic has a destination register and glc, with no wait
; in front of it. It is followed by s_waitcnt vmcnt(0) and a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The llc invocation that passes -amdgcn-skip-cache-invalidations keeps the
; wait but expects no invalidate instruction.
4615define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
4616; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
4617; GFX7:       ; %bb.0: ; %entry
4618; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4619; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4620; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4621; GFX7-NEXT:    s_add_u32 s4, s0, 16
4622; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4623; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4624; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4625; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4626; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4627; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4628; GFX7-NEXT:    s_waitcnt vmcnt(0)
4629; GFX7-NEXT:    buffer_wbinvl1_vol
4630; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4631; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4632; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4633; GFX7-NEXT:    flat_store_dword v[0:1], v2
4634; GFX7-NEXT:    s_endpgm
4635;
4636; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
4637; GFX10-WGP:       ; %bb.0: ; %entry
4638; GFX10-WGP-NEXT:    s_clause 0x1
4639; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4640; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4641; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4642; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4643; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4644; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4645; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4646; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4647; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4648; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4649; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4650; GFX10-WGP-NEXT:    buffer_gl0_inv
4651; GFX10-WGP-NEXT:    buffer_gl1_inv
4652; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4653; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4654; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4655; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4656; GFX10-WGP-NEXT:    s_endpgm
4657;
4658; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
4659; GFX10-CU:       ; %bb.0: ; %entry
4660; GFX10-CU-NEXT:    s_clause 0x1
4661; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4662; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4663; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4664; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4665; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4666; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4667; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4668; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4669; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4670; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4671; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4672; GFX10-CU-NEXT:    buffer_gl0_inv
4673; GFX10-CU-NEXT:    buffer_gl1_inv
4674; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4675; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4676; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4677; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4678; GFX10-CU-NEXT:    s_endpgm
4679;
4680; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
4681; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4682; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4683; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4684; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4685; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4686; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4687; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4688; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4689; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4690; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4691; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4692; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4693; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4694; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4695; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4696; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4697; SKIP-CACHE-INV-NEXT:    s_endpgm
4698    i32* %out, i32 %in, i32 %old) {
4699entry:
4700  %gep = getelementptr i32, i32* %out, i32 4
4701  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
  ; Element 0 of the { i32, i1 } result is the value read from memory.
4702  %val0 = extractvalue { i32, i1 } %val, 0
4703  store i32 %val0, i32* %out, align 4
4704  ret void
4705}
4706
; Returning cmpxchg through a flat pointer at syncscope("agent-one-as") with
; success ordering release and failure ordering acquire. The kernel operates
; on %out advanced by 4 i32 elements (16 bytes) and stores the value returned
; by the cmpxchg back to %out.
; Expected code: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on
; gfx1010) precedes the atomic. The atomic has a destination register and glc,
; and is followed by s_waitcnt vmcnt(0) and a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The llc invocation that passes -amdgcn-skip-cache-invalidations keeps both
; waits but expects no invalidate instruction.
4707define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
4708; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
4709; GFX7:       ; %bb.0: ; %entry
4710; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4711; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4712; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4713; GFX7-NEXT:    s_add_u32 s4, s0, 16
4714; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4715; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4716; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4717; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4718; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4719; GFX7-NEXT:    s_waitcnt vmcnt(0)
4720; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4721; GFX7-NEXT:    s_waitcnt vmcnt(0)
4722; GFX7-NEXT:    buffer_wbinvl1_vol
4723; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4724; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4725; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4726; GFX7-NEXT:    flat_store_dword v[0:1], v2
4727; GFX7-NEXT:    s_endpgm
4728;
4729; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
4730; GFX10-WGP:       ; %bb.0: ; %entry
4731; GFX10-WGP-NEXT:    s_clause 0x1
4732; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4733; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4734; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4735; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4736; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4737; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4738; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4739; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4740; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4741; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4742; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4743; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4744; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4745; GFX10-WGP-NEXT:    buffer_gl0_inv
4746; GFX10-WGP-NEXT:    buffer_gl1_inv
4747; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4748; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4749; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4750; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4751; GFX10-WGP-NEXT:    s_endpgm
4752;
4753; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
4754; GFX10-CU:       ; %bb.0: ; %entry
4755; GFX10-CU-NEXT:    s_clause 0x1
4756; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4757; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4758; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4759; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4760; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4761; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4762; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4763; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4764; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4765; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4766; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4767; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4768; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4769; GFX10-CU-NEXT:    buffer_gl0_inv
4770; GFX10-CU-NEXT:    buffer_gl1_inv
4771; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4772; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4773; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4774; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4775; GFX10-CU-NEXT:    s_endpgm
4776;
4777; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
4778; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4779; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4780; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4781; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4782; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4783; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4784; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4785; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4786; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4787; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4788; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4789; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4790; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4791; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4792; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4793; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4794; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4795; SKIP-CACHE-INV-NEXT:    s_endpgm
4796    i32* %out, i32 %in, i32 %old) {
4797entry:
4798  %gep = getelementptr i32, i32* %out, i32 4
4799  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
  ; Element 0 of the { i32, i1 } result is the value read from memory.
4800  %val0 = extractvalue { i32, i1 } %val, 0
4801  store i32 %val0, i32* %out, align 4
4802  ret void
4803}
4804
; Returning cmpxchg through a flat pointer at syncscope("agent-one-as") with
; success ordering acq_rel and failure ordering acquire. The kernel operates
; on %out advanced by 4 i32 elements (16 bytes) and stores the value returned
; by the cmpxchg back to %out.
; Expected code: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on
; gfx1010) precedes the atomic. The atomic has a destination register and glc,
; and is followed by s_waitcnt vmcnt(0) and a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The llc invocation that passes -amdgcn-skip-cache-invalidations keeps both
; waits but expects no invalidate instruction.
4805define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
4806; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
4807; GFX7:       ; %bb.0: ; %entry
4808; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4809; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4810; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4811; GFX7-NEXT:    s_add_u32 s4, s0, 16
4812; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4813; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4814; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4815; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4816; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4817; GFX7-NEXT:    s_waitcnt vmcnt(0)
4818; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4819; GFX7-NEXT:    s_waitcnt vmcnt(0)
4820; GFX7-NEXT:    buffer_wbinvl1_vol
4821; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4822; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4823; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4824; GFX7-NEXT:    flat_store_dword v[0:1], v2
4825; GFX7-NEXT:    s_endpgm
4826;
4827; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
4828; GFX10-WGP:       ; %bb.0: ; %entry
4829; GFX10-WGP-NEXT:    s_clause 0x1
4830; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4831; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4832; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4833; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4834; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4835; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4836; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4837; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4838; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4839; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4840; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4841; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4842; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4843; GFX10-WGP-NEXT:    buffer_gl0_inv
4844; GFX10-WGP-NEXT:    buffer_gl1_inv
4845; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4846; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4847; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4848; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4849; GFX10-WGP-NEXT:    s_endpgm
4850;
4851; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
4852; GFX10-CU:       ; %bb.0: ; %entry
4853; GFX10-CU-NEXT:    s_clause 0x1
4854; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4855; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4856; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4857; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4858; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4859; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4860; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4861; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4862; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4863; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4864; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4865; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4866; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4867; GFX10-CU-NEXT:    buffer_gl0_inv
4868; GFX10-CU-NEXT:    buffer_gl1_inv
4869; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4870; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4871; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4872; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4873; GFX10-CU-NEXT:    s_endpgm
4874;
4875; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
4876; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4877; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4878; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4879; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4880; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4881; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4882; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4883; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4884; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4885; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4886; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4887; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4888; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4889; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4890; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4891; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4892; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4893; SKIP-CACHE-INV-NEXT:    s_endpgm
4894    i32* %out, i32 %in, i32 %old) {
4895entry:
4896  %gep = getelementptr i32, i32* %out, i32 4
4897  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
  ; Element 0 of the { i32, i1 } result is the value read from memory.
4898  %val0 = extractvalue { i32, i1 } %val, 0
4899  store i32 %val0, i32* %out, align 4
4900  ret void
4901}
4902
; Returning cmpxchg through a flat pointer at syncscope("agent-one-as") with
; success ordering seq_cst and failure ordering acquire. The kernel operates
; on %out advanced by 4 i32 elements (16 bytes) and stores the value returned
; by the cmpxchg back to %out.
; Expected code: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on
; gfx1010) precedes the atomic. The atomic has a destination register and glc,
; and is followed by s_waitcnt vmcnt(0) and a cache invalidate
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010).
; The llc invocation that passes -amdgcn-skip-cache-invalidations keeps both
; waits but expects no invalidate instruction.
4903define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
4904; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
4905; GFX7:       ; %bb.0: ; %entry
4906; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4907; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4908; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4909; GFX7-NEXT:    s_add_u32 s4, s0, 16
4910; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4911; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4912; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4913; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4914; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4915; GFX7-NEXT:    s_waitcnt vmcnt(0)
4916; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4917; GFX7-NEXT:    s_waitcnt vmcnt(0)
4918; GFX7-NEXT:    buffer_wbinvl1_vol
4919; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4920; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4921; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4922; GFX7-NEXT:    flat_store_dword v[0:1], v2
4923; GFX7-NEXT:    s_endpgm
4924;
4925; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
4926; GFX10-WGP:       ; %bb.0: ; %entry
4927; GFX10-WGP-NEXT:    s_clause 0x1
4928; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4929; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4930; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4931; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
4932; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
4933; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
4934; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
4935; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
4936; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
4937; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4938; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4939; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4940; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4941; GFX10-WGP-NEXT:    buffer_gl0_inv
4942; GFX10-WGP-NEXT:    buffer_gl1_inv
4943; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4944; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4945; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4946; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
4947; GFX10-WGP-NEXT:    s_endpgm
4948;
4949; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
4950; GFX10-CU:       ; %bb.0: ; %entry
4951; GFX10-CU-NEXT:    s_clause 0x1
4952; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4953; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4954; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4955; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
4956; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
4957; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
4958; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
4959; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
4960; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
4961; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4962; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4963; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4964; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4965; GFX10-CU-NEXT:    buffer_gl0_inv
4966; GFX10-CU-NEXT:    buffer_gl1_inv
4967; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4968; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4969; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4970; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
4971; GFX10-CU-NEXT:    s_endpgm
4972;
4973; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
4974; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4975; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
4976; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4977; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4978; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
4979; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
4980; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
4981; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
4982; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
4983; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
4984; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4985; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4986; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4987; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
4988; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
4989; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4990; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
4991; SKIP-CACHE-INV-NEXT:    s_endpgm
4992    i32* %out, i32 %in, i32 %old) {
4993entry:
4994  %gep = getelementptr i32, i32* %out, i32 4
4995  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
  ; Element 0 of the { i32, i1 } result is the value read from memory.
4996  %val0 = extractvalue { i32, i1 } %val, 0
4997  store i32 %val0, i32* %out, align 4
4998  ret void
4999}
5000
; Test kernel: a volatile i32 cmpxchg through a generic (unqualified) pointer
; with syncscope("agent-one-as"), success ordering seq_cst and failure
; ordering seq_cst, where the value returned by the cmpxchg is used.
;
; The kernel compares-and-swaps the i32 located 4 elements past %out (the
; "+ 16" added to the base address in the expected code) and then stores the
; value the cmpxchg loaded back to %out itself.
;
; The commented assertion lines between the "define" line and the parameter
; list are the expected llc output for each RUN configuration at the top of
; the file. They are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them instead of editing by hand. What differs between the
; configurations after the returning atomic, as visible in the expectations:
;   * gfx700 (amdhsa): a buffer_wbinvl1_vol follows the wait on vmcnt.
;   * gfx1010, with and without +cumode: buffer_gl0_inv and buffer_gl1_inv
;     follow the wait on vmcnt, and an extra s_waitcnt_vscnt precedes the
;     atomic.
;   * gfx700 (amdpal) with -amdgcn-skip-cache-invalidations: neither of the
;     above instructions appears between the atomic and the final store.
5001define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
5002; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
5003; GFX7:       ; %bb.0: ; %entry
5004; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5005; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5006; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5007; GFX7-NEXT:    s_add_u32 s4, s0, 16
5008; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5009; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5010; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5011; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5012; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5013; GFX7-NEXT:    s_waitcnt vmcnt(0)
5014; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5015; GFX7-NEXT:    s_waitcnt vmcnt(0)
5016; GFX7-NEXT:    buffer_wbinvl1_vol
5017; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5018; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5019; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5020; GFX7-NEXT:    flat_store_dword v[0:1], v2
5021; GFX7-NEXT:    s_endpgm
5022;
5023; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
5024; GFX10-WGP:       ; %bb.0: ; %entry
5025; GFX10-WGP-NEXT:    s_clause 0x1
5026; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5027; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5028; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5029; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
5030; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
5031; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
5032; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
5033; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
5034; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
5035; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5036; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5037; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5038; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5039; GFX10-WGP-NEXT:    buffer_gl0_inv
5040; GFX10-WGP-NEXT:    buffer_gl1_inv
5041; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5042; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5043; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5044; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
5045; GFX10-WGP-NEXT:    s_endpgm
5046;
5047; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
5048; GFX10-CU:       ; %bb.0: ; %entry
5049; GFX10-CU-NEXT:    s_clause 0x1
5050; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5051; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
5052; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5053; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
5054; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
5055; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
5056; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
5057; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
5058; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
5059; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5060; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5061; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5062; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5063; GFX10-CU-NEXT:    buffer_gl0_inv
5064; GFX10-CU-NEXT:    buffer_gl1_inv
5065; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5066; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5067; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5068; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
5069; GFX10-CU-NEXT:    s_endpgm
5070;
5071; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
5072; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5073; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
5074; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5075; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5076; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16
5077; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0
5078; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
5079; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1
5080; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
5081; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
5082; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5083; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5084; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5085; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
5086; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
5087; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5088; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
5089; SKIP-CACHE-INV-NEXT:    s_endpgm
5090    i32* %out, i32 %in, i32 %old) {
5091entry:
  ; Address of the i32 that is 4 elements (16 bytes) past %out.
5092  %gep = getelementptr i32, i32* %out, i32 4
  ; Swap in %in if the current value equals %old; both orderings are seq_cst.
5093  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
  ; Element 0 of the { i32, i1 } result is the value read from memory; using
  ; it is what makes this the "ret" (returning) variant of the test.
5094  %val0 = extractvalue { i32, i1 } %val, 0
5095  store i32 %val0, i32* %out, align 4
5096  ret void
5097}
5098
5099