1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s 6 7define amdgpu_kernel void @flat_agent_unordered_load( 8; GFX7-LABEL: flat_agent_unordered_load: 9; GFX7: ; %bb.0: ; %entry 10; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 11; GFX7-NEXT: s_waitcnt lgkmcnt(0) 12; GFX7-NEXT: v_mov_b32_e32 v0, s0 13; GFX7-NEXT: v_mov_b32_e32 v1, s1 14; GFX7-NEXT: flat_load_dword v0, v[0:1] 15; GFX7-NEXT: v_mov_b32_e32 v2, s2 16; GFX7-NEXT: v_mov_b32_e32 v3, s3 17; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 18; GFX7-NEXT: flat_store_dword v[2:3], v0 19; GFX7-NEXT: s_endpgm 20; 21; GFX10-WGP-LABEL: flat_agent_unordered_load: 22; GFX10-WGP: ; %bb.0: ; %entry 23; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 24; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 25; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 26; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 27; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 28; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 29; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 30; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 31; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 32; GFX10-WGP-NEXT: s_endpgm 33; 34; GFX10-CU-LABEL: flat_agent_unordered_load: 35; GFX10-CU: ; %bb.0: ; %entry 36; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 37; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 38; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 39; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 40; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 41; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 42; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 43; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 44; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 45; GFX10-CU-NEXT: s_endpgm 46; 47; SKIP-CACHE-INV-LABEL: flat_agent_unordered_load: 48; SKIP-CACHE-INV: ; %bb.0: ; %entry 49; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 50; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 51; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 52; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 53; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 54; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 55; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 56; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 57; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 58; SKIP-CACHE-INV-NEXT: s_endpgm 59 i32* %in, i32* %out) { 60entry: 61 %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 62 store i32 %val, i32* %out 63 ret void 64} 65 66define amdgpu_kernel void @flat_agent_monotonic_load( 67; GFX7-LABEL: flat_agent_monotonic_load: 68; GFX7: ; %bb.0: ; %entry 69; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 70; GFX7-NEXT: s_waitcnt lgkmcnt(0) 71; GFX7-NEXT: v_mov_b32_e32 v0, s0 72; GFX7-NEXT: v_mov_b32_e32 v1, s1 73; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 74; GFX7-NEXT: v_mov_b32_e32 v2, s2 75; GFX7-NEXT: v_mov_b32_e32 v3, s3 76; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 77; GFX7-NEXT: flat_store_dword v[2:3], v0 78; GFX7-NEXT: s_endpgm 79; 80; GFX10-WGP-LABEL: flat_agent_monotonic_load: 81; GFX10-WGP: ; %bb.0: ; %entry 82; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 83; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 84; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 85; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 86; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 87; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 88; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 89; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 90; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 91; GFX10-WGP-NEXT: s_endpgm 92; 93; GFX10-CU-LABEL: flat_agent_monotonic_load: 94; 
GFX10-CU: ; %bb.0: ; %entry 95; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 96; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 97; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 98; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 99; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 100; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 101; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 102; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 103; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 104; GFX10-CU-NEXT: s_endpgm 105; 106; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_load: 107; SKIP-CACHE-INV: ; %bb.0: ; %entry 108; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 109; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 110; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 111; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 112; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 113; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 114; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 115; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 116; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 117; SKIP-CACHE-INV-NEXT: s_endpgm 118 i32* %in, i32* %out) { 119entry: 120 %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 121 store i32 %val, i32* %out 122 ret void 123} 124 125define amdgpu_kernel void @flat_agent_acquire_load( 126; GFX7-LABEL: flat_agent_acquire_load: 127; GFX7: ; %bb.0: ; %entry 128; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 129; GFX7-NEXT: s_waitcnt lgkmcnt(0) 130; GFX7-NEXT: v_mov_b32_e32 v0, s0 131; GFX7-NEXT: v_mov_b32_e32 v1, s1 132; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 133; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 134; GFX7-NEXT: buffer_wbinvl1_vol 135; GFX7-NEXT: v_mov_b32_e32 v2, s2 136; GFX7-NEXT: v_mov_b32_e32 v3, s3 137; GFX7-NEXT: s_waitcnt lgkmcnt(0) 138; GFX7-NEXT: flat_store_dword v[2:3], v0 139; GFX7-NEXT: s_endpgm 140; 141; GFX10-WGP-LABEL: flat_agent_acquire_load: 142; GFX10-WGP: ; %bb.0: ; %entry 143; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 144; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 145; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 146; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 147; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 148; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 149; GFX10-WGP-NEXT: buffer_gl0_inv 150; GFX10-WGP-NEXT: buffer_gl1_inv 151; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 152; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 153; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 154; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 155; GFX10-WGP-NEXT: s_endpgm 156; 157; GFX10-CU-LABEL: flat_agent_acquire_load: 158; GFX10-CU: ; %bb.0: ; %entry 159; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 160; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 161; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 162; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 163; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 164; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 165; GFX10-CU-NEXT: buffer_gl0_inv 166; GFX10-CU-NEXT: buffer_gl1_inv 167; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 168; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 169; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 170; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 171; GFX10-CU-NEXT: s_endpgm 172; 173; SKIP-CACHE-INV-LABEL: flat_agent_acquire_load: 174; SKIP-CACHE-INV: ; %bb.0: ; %entry 175; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 176; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 177; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 178; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 179; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 180; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 181; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 182; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 183; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 184; SKIP-CACHE-INV-NEXT: s_endpgm 185 i32* %in, i32* %out) { 186entry: 187 %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 188 store i32 %val, i32* %out 189 ret void 190} 191 192define amdgpu_kernel void @flat_agent_seq_cst_load( 193; GFX7-LABEL: flat_agent_seq_cst_load: 194; GFX7: ; %bb.0: ; %entry 
195; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 196; GFX7-NEXT: s_waitcnt lgkmcnt(0) 197; GFX7-NEXT: v_mov_b32_e32 v0, s0 198; GFX7-NEXT: v_mov_b32_e32 v1, s1 199; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 200; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 201; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 202; GFX7-NEXT: buffer_wbinvl1_vol 203; GFX7-NEXT: v_mov_b32_e32 v2, s2 204; GFX7-NEXT: v_mov_b32_e32 v3, s3 205; GFX7-NEXT: s_waitcnt lgkmcnt(0) 206; GFX7-NEXT: flat_store_dword v[2:3], v0 207; GFX7-NEXT: s_endpgm 208; 209; GFX10-WGP-LABEL: flat_agent_seq_cst_load: 210; GFX10-WGP: ; %bb.0: ; %entry 211; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 212; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 213; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 214; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 215; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 216; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 217; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 218; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 219; GFX10-WGP-NEXT: buffer_gl0_inv 220; GFX10-WGP-NEXT: buffer_gl1_inv 221; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 222; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 223; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 224; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 225; GFX10-WGP-NEXT: s_endpgm 226; 227; GFX10-CU-LABEL: flat_agent_seq_cst_load: 228; GFX10-CU: ; %bb.0: ; %entry 229; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 230; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 231; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 232; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 233; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 234; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 235; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 236; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 237; GFX10-CU-NEXT: buffer_gl0_inv 238; GFX10-CU-NEXT: buffer_gl1_inv 239; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 240; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 241; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 242; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 243; GFX10-CU-NEXT: s_endpgm 
244; 245; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_load: 246; SKIP-CACHE-INV: ; %bb.0: ; %entry 247; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 248; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 249; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 250; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 251; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 252; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 253; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 254; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 255; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 256; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 257; SKIP-CACHE-INV-NEXT: s_endpgm 258 i32* %in, i32* %out) { 259entry: 260 %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 261 store i32 %val, i32* %out 262 ret void 263} 264 265define amdgpu_kernel void @flat_agent_unordered_store( 266; GFX7-LABEL: flat_agent_unordered_store: 267; GFX7: ; %bb.0: ; %entry 268; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 269; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 270; GFX7-NEXT: s_waitcnt lgkmcnt(0) 271; GFX7-NEXT: v_mov_b32_e32 v2, s2 272; GFX7-NEXT: v_mov_b32_e32 v0, s0 273; GFX7-NEXT: v_mov_b32_e32 v1, s1 274; GFX7-NEXT: flat_store_dword v[0:1], v2 275; GFX7-NEXT: s_endpgm 276; 277; GFX10-WGP-LABEL: flat_agent_unordered_store: 278; GFX10-WGP: ; %bb.0: ; %entry 279; GFX10-WGP-NEXT: s_clause 0x1 280; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 281; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 282; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 283; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 284; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 285; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 286; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 287; GFX10-WGP-NEXT: s_endpgm 288; 289; GFX10-CU-LABEL: flat_agent_unordered_store: 290; GFX10-CU: ; %bb.0: ; %entry 291; GFX10-CU-NEXT: s_clause 0x1 292; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 293; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 294; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 
295; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 296; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 297; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 298; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 299; GFX10-CU-NEXT: s_endpgm 300; 301; SKIP-CACHE-INV-LABEL: flat_agent_unordered_store: 302; SKIP-CACHE-INV: ; %bb.0: ; %entry 303; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 304; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 305; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 306; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 307; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 308; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 309; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 310; SKIP-CACHE-INV-NEXT: s_endpgm 311 i32 %in, i32* %out) { 312entry: 313 store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 314 ret void 315} 316 317define amdgpu_kernel void @flat_agent_monotonic_store( 318; GFX7-LABEL: flat_agent_monotonic_store: 319; GFX7: ; %bb.0: ; %entry 320; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 321; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 322; GFX7-NEXT: s_waitcnt lgkmcnt(0) 323; GFX7-NEXT: v_mov_b32_e32 v2, s2 324; GFX7-NEXT: v_mov_b32_e32 v0, s0 325; GFX7-NEXT: v_mov_b32_e32 v1, s1 326; GFX7-NEXT: flat_store_dword v[0:1], v2 327; GFX7-NEXT: s_endpgm 328; 329; GFX10-WGP-LABEL: flat_agent_monotonic_store: 330; GFX10-WGP: ; %bb.0: ; %entry 331; GFX10-WGP-NEXT: s_clause 0x1 332; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 333; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 334; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 335; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 336; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 337; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 338; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 339; GFX10-WGP-NEXT: s_endpgm 340; 341; GFX10-CU-LABEL: flat_agent_monotonic_store: 342; GFX10-CU: ; %bb.0: ; %entry 343; GFX10-CU-NEXT: s_clause 0x1 344; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 345; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 346; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) 347; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 348; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 349; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 350; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 351; GFX10-CU-NEXT: s_endpgm 352; 353; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_store: 354; SKIP-CACHE-INV: ; %bb.0: ; %entry 355; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 356; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 357; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 358; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 359; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 360; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 361; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 362; SKIP-CACHE-INV-NEXT: s_endpgm 363 i32 %in, i32* %out) { 364entry: 365 store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 366 ret void 367} 368 369define amdgpu_kernel void @flat_agent_release_store( 370; GFX7-LABEL: flat_agent_release_store: 371; GFX7: ; %bb.0: ; %entry 372; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 373; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 374; GFX7-NEXT: s_waitcnt lgkmcnt(0) 375; GFX7-NEXT: v_mov_b32_e32 v2, s2 376; GFX7-NEXT: v_mov_b32_e32 v0, s0 377; GFX7-NEXT: v_mov_b32_e32 v1, s1 378; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 379; GFX7-NEXT: flat_store_dword v[0:1], v2 380; GFX7-NEXT: s_endpgm 381; 382; GFX10-WGP-LABEL: flat_agent_release_store: 383; GFX10-WGP: ; %bb.0: ; %entry 384; GFX10-WGP-NEXT: s_clause 0x1 385; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 386; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 387; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 388; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 389; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 390; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 391; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 392; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 393; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 394; GFX10-WGP-NEXT: s_endpgm 395; 396; GFX10-CU-LABEL: flat_agent_release_store: 397; GFX10-CU: ; %bb.0: ; %entry 398; GFX10-CU-NEXT: 
s_clause 0x1 399; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 400; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 401; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 402; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 403; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 404; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 405; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 406; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 407; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 408; GFX10-CU-NEXT: s_endpgm 409; 410; SKIP-CACHE-INV-LABEL: flat_agent_release_store: 411; SKIP-CACHE-INV: ; %bb.0: ; %entry 412; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 413; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 414; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 415; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 416; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 417; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 418; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 419; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 420; SKIP-CACHE-INV-NEXT: s_endpgm 421 i32 %in, i32* %out) { 422entry: 423 store atomic i32 %in, i32* %out syncscope("agent") release, align 4 424 ret void 425} 426 427define amdgpu_kernel void @flat_agent_seq_cst_store( 428; GFX7-LABEL: flat_agent_seq_cst_store: 429; GFX7: ; %bb.0: ; %entry 430; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 431; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 432; GFX7-NEXT: s_waitcnt lgkmcnt(0) 433; GFX7-NEXT: v_mov_b32_e32 v2, s2 434; GFX7-NEXT: v_mov_b32_e32 v0, s0 435; GFX7-NEXT: v_mov_b32_e32 v1, s1 436; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 437; GFX7-NEXT: flat_store_dword v[0:1], v2 438; GFX7-NEXT: s_endpgm 439; 440; GFX10-WGP-LABEL: flat_agent_seq_cst_store: 441; GFX10-WGP: ; %bb.0: ; %entry 442; GFX10-WGP-NEXT: s_clause 0x1 443; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 444; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 445; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 446; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 447; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 448; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s2 449; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 450; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 451; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 452; GFX10-WGP-NEXT: s_endpgm 453; 454; GFX10-CU-LABEL: flat_agent_seq_cst_store: 455; GFX10-CU: ; %bb.0: ; %entry 456; GFX10-CU-NEXT: s_clause 0x1 457; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 458; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 459; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 460; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 461; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 462; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 463; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 464; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 465; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 466; GFX10-CU-NEXT: s_endpgm 467; 468; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_store: 469; SKIP-CACHE-INV: ; %bb.0: ; %entry 470; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 471; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 472; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 473; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 474; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 475; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 476; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 477; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 478; SKIP-CACHE-INV-NEXT: s_endpgm 479 i32 %in, i32* %out) { 480entry: 481 store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 482 ret void 483} 484 485define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( 486; GFX7-LABEL: flat_agent_monotonic_atomicrmw: 487; GFX7: ; %bb.0: ; %entry 488; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 489; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 490; GFX7-NEXT: s_waitcnt lgkmcnt(0) 491; GFX7-NEXT: v_mov_b32_e32 v0, s0 492; GFX7-NEXT: v_mov_b32_e32 v1, s1 493; GFX7-NEXT: v_mov_b32_e32 v2, s2 494; GFX7-NEXT: flat_atomic_swap v[0:1], v2 495; GFX7-NEXT: s_endpgm 496; 497; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw: 498; GFX10-WGP: ; %bb.0: ; %entry 499; GFX10-WGP-NEXT: 
s_clause 0x1 500; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 501; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 502; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 503; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 504; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 505; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 506; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 507; GFX10-WGP-NEXT: s_endpgm 508; 509; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw: 510; GFX10-CU: ; %bb.0: ; %entry 511; GFX10-CU-NEXT: s_clause 0x1 512; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 513; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 514; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 515; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 516; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 517; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 518; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 519; GFX10-CU-NEXT: s_endpgm 520; 521; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_atomicrmw: 522; SKIP-CACHE-INV: ; %bb.0: ; %entry 523; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 524; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 525; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 526; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 527; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 528; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 529; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 530; SKIP-CACHE-INV-NEXT: s_endpgm 531 i32* %out, i32 %in) { 532entry: 533 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic 534 ret void 535} 536 537define amdgpu_kernel void @flat_agent_acquire_atomicrmw( 538; GFX7-LABEL: flat_agent_acquire_atomicrmw: 539; GFX7: ; %bb.0: ; %entry 540; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 541; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 542; GFX7-NEXT: s_waitcnt lgkmcnt(0) 543; GFX7-NEXT: v_mov_b32_e32 v0, s0 544; GFX7-NEXT: v_mov_b32_e32 v1, s1 545; GFX7-NEXT: v_mov_b32_e32 v2, s2 546; GFX7-NEXT: flat_atomic_swap v[0:1], v2 547; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 548; GFX7-NEXT: buffer_wbinvl1_vol 549; GFX7-NEXT: 
s_endpgm 550; 551; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw: 552; GFX10-WGP: ; %bb.0: ; %entry 553; GFX10-WGP-NEXT: s_clause 0x1 554; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 555; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 556; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 557; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 558; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 559; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 560; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 561; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 562; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 563; GFX10-WGP-NEXT: buffer_gl0_inv 564; GFX10-WGP-NEXT: buffer_gl1_inv 565; GFX10-WGP-NEXT: s_endpgm 566; 567; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw: 568; GFX10-CU: ; %bb.0: ; %entry 569; GFX10-CU-NEXT: s_clause 0x1 570; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 571; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 572; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 573; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 574; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 575; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 576; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 577; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 578; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 579; GFX10-CU-NEXT: buffer_gl0_inv 580; GFX10-CU-NEXT: buffer_gl1_inv 581; GFX10-CU-NEXT: s_endpgm 582; 583; SKIP-CACHE-INV-LABEL: flat_agent_acquire_atomicrmw: 584; SKIP-CACHE-INV: ; %bb.0: ; %entry 585; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 586; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 587; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 588; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 589; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 590; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 591; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 592; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 593; SKIP-CACHE-INV-NEXT: s_endpgm 594 i32* %out, i32 %in) { 595entry: 596 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire 597 ret void 598} 599 600define amdgpu_kernel void 
@flat_agent_release_atomicrmw( 601; GFX7-LABEL: flat_agent_release_atomicrmw: 602; GFX7: ; %bb.0: ; %entry 603; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 604; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 605; GFX7-NEXT: s_waitcnt lgkmcnt(0) 606; GFX7-NEXT: v_mov_b32_e32 v0, s0 607; GFX7-NEXT: v_mov_b32_e32 v1, s1 608; GFX7-NEXT: v_mov_b32_e32 v2, s2 609; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 610; GFX7-NEXT: flat_atomic_swap v[0:1], v2 611; GFX7-NEXT: s_endpgm 612; 613; GFX10-WGP-LABEL: flat_agent_release_atomicrmw: 614; GFX10-WGP: ; %bb.0: ; %entry 615; GFX10-WGP-NEXT: s_clause 0x1 616; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 617; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 618; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 619; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 620; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 621; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 622; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 623; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 624; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 625; GFX10-WGP-NEXT: s_endpgm 626; 627; GFX10-CU-LABEL: flat_agent_release_atomicrmw: 628; GFX10-CU: ; %bb.0: ; %entry 629; GFX10-CU-NEXT: s_clause 0x1 630; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 631; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 632; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 633; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 634; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 635; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 636; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 637; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 638; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 639; GFX10-CU-NEXT: s_endpgm 640; 641; SKIP-CACHE-INV-LABEL: flat_agent_release_atomicrmw: 642; SKIP-CACHE-INV: ; %bb.0: ; %entry 643; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 644; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 645; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 646; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 647; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 648; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s0 649; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 650; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 651; SKIP-CACHE-INV-NEXT: s_endpgm 652 i32* %out, i32 %in) { 653entry: 654 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release 655 ret void 656} 657 658define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( 659; GFX7-LABEL: flat_agent_acq_rel_atomicrmw: 660; GFX7: ; %bb.0: ; %entry 661; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 662; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 663; GFX7-NEXT: s_waitcnt lgkmcnt(0) 664; GFX7-NEXT: v_mov_b32_e32 v0, s0 665; GFX7-NEXT: v_mov_b32_e32 v1, s1 666; GFX7-NEXT: v_mov_b32_e32 v2, s2 667; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 668; GFX7-NEXT: flat_atomic_swap v[0:1], v2 669; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 670; GFX7-NEXT: buffer_wbinvl1_vol 671; GFX7-NEXT: s_endpgm 672; 673; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw: 674; GFX10-WGP: ; %bb.0: ; %entry 675; GFX10-WGP-NEXT: s_clause 0x1 676; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 677; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 678; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 679; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 680; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 681; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 682; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 683; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 684; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 685; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 686; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 687; GFX10-WGP-NEXT: buffer_gl0_inv 688; GFX10-WGP-NEXT: buffer_gl1_inv 689; GFX10-WGP-NEXT: s_endpgm 690; 691; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw: 692; GFX10-CU: ; %bb.0: ; %entry 693; GFX10-CU-NEXT: s_clause 0x1 694; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 695; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 696; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 697; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 698; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 699; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s2 700; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 701; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 702; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 703; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 704; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 705; GFX10-CU-NEXT: buffer_gl0_inv 706; GFX10-CU-NEXT: buffer_gl1_inv 707; GFX10-CU-NEXT: s_endpgm 708; 709; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_atomicrmw: 710; SKIP-CACHE-INV: ; %bb.0: ; %entry 711; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 712; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 713; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 714; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 715; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 716; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 717; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 718; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 719; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 720; SKIP-CACHE-INV-NEXT: s_endpgm 721 i32* %out, i32 %in) { 722entry: 723 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel 724 ret void 725} 726 727define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( 728; GFX7-LABEL: flat_agent_seq_cst_atomicrmw: 729; GFX7: ; %bb.0: ; %entry 730; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 731; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 732; GFX7-NEXT: s_waitcnt lgkmcnt(0) 733; GFX7-NEXT: v_mov_b32_e32 v0, s0 734; GFX7-NEXT: v_mov_b32_e32 v1, s1 735; GFX7-NEXT: v_mov_b32_e32 v2, s2 736; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 737; GFX7-NEXT: flat_atomic_swap v[0:1], v2 738; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 739; GFX7-NEXT: buffer_wbinvl1_vol 740; GFX7-NEXT: s_endpgm 741; 742; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw: 743; GFX10-WGP: ; %bb.0: ; %entry 744; GFX10-WGP-NEXT: s_clause 0x1 745; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 746; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 747; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 748; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 749; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 750; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 751; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 752; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 753; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 754; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 755; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 756; GFX10-WGP-NEXT: buffer_gl0_inv 757; GFX10-WGP-NEXT: buffer_gl1_inv 758; GFX10-WGP-NEXT: s_endpgm 759; 760; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw: 761; GFX10-CU: ; %bb.0: ; %entry 762; GFX10-CU-NEXT: s_clause 0x1 763; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 764; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 765; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 766; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 767; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 768; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 769; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 770; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 771; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 772; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 773; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 774; GFX10-CU-NEXT: buffer_gl0_inv 775; GFX10-CU-NEXT: buffer_gl1_inv 776; GFX10-CU-NEXT: s_endpgm 777; 778; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_atomicrmw: 779; SKIP-CACHE-INV: ; %bb.0: ; %entry 780; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 781; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 782; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 783; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 784; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 785; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 786; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 787; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 788; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 789; SKIP-CACHE-INV-NEXT: s_endpgm 790 i32* %out, i32 %in) { 791entry: 792 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst 793 ret void 794} 795 796define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( 797; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw: 
798; GFX7: ; %bb.0: ; %entry 799; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 800; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 801; GFX7-NEXT: s_waitcnt lgkmcnt(0) 802; GFX7-NEXT: v_mov_b32_e32 v0, s0 803; GFX7-NEXT: v_mov_b32_e32 v1, s1 804; GFX7-NEXT: v_mov_b32_e32 v2, s2 805; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 806; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 807; GFX7-NEXT: buffer_wbinvl1_vol 808; GFX7-NEXT: s_waitcnt lgkmcnt(0) 809; GFX7-NEXT: flat_store_dword v[0:1], v2 810; GFX7-NEXT: s_endpgm 811; 812; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: 813; GFX10-WGP: ; %bb.0: ; %entry 814; GFX10-WGP-NEXT: s_clause 0x1 815; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 816; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 817; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 818; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 819; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 820; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 821; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 822; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 823; GFX10-WGP-NEXT: buffer_gl0_inv 824; GFX10-WGP-NEXT: buffer_gl1_inv 825; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 826; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 827; GFX10-WGP-NEXT: s_endpgm 828; 829; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw: 830; GFX10-CU: ; %bb.0: ; %entry 831; GFX10-CU-NEXT: s_clause 0x1 832; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 833; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 834; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 835; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 836; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 837; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 838; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 839; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 840; GFX10-CU-NEXT: buffer_gl0_inv 841; GFX10-CU-NEXT: buffer_gl1_inv 842; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 843; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 844; GFX10-CU-NEXT: s_endpgm 845; 846; SKIP-CACHE-INV-LABEL: flat_agent_acquire_ret_atomicrmw: 847; SKIP-CACHE-INV: ; %bb.0: 
; %entry 848; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 849; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 850; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 851; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 852; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 853; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 854; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 855; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 856; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 857; SKIP-CACHE-INV-NEXT: s_endpgm 858 i32* %out, i32 %in) { 859entry: 860 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire 861 store i32 %val, i32* %out, align 4 862 ret void 863} 864; Agent-scope acq_rel xchg on %out whose old value is stored back to %out; the checks expect a wait before the swap and, except in the skip-cache-invalidations run, a cache invalidate after it. 865define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( 866; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw: 867; GFX7: ; %bb.0: ; %entry 868; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 869; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 870; GFX7-NEXT: s_waitcnt lgkmcnt(0) 871; GFX7-NEXT: v_mov_b32_e32 v0, s0 872; GFX7-NEXT: v_mov_b32_e32 v1, s1 873; GFX7-NEXT: v_mov_b32_e32 v2, s2 874; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 875; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 876; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 877; GFX7-NEXT: buffer_wbinvl1_vol 878; GFX7-NEXT: s_waitcnt lgkmcnt(0) 879; GFX7-NEXT: flat_store_dword v[0:1], v2 880; GFX7-NEXT: s_endpgm 881; 882; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: 883; GFX10-WGP: ; %bb.0: ; %entry 884; GFX10-WGP-NEXT: s_clause 0x1 885; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 886; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 887; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 888; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 889; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 890; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 891; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 892; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 893; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 894; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 895; 
GFX10-WGP-NEXT: buffer_gl0_inv 896; GFX10-WGP-NEXT: buffer_gl1_inv 897; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 898; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 899; GFX10-WGP-NEXT: s_endpgm 900; 901; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: 902; GFX10-CU: ; %bb.0: ; %entry 903; GFX10-CU-NEXT: s_clause 0x1 904; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 905; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 906; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 907; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 908; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 909; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 910; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 911; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 912; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 913; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 914; GFX10-CU-NEXT: buffer_gl0_inv 915; GFX10-CU-NEXT: buffer_gl1_inv 916; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 917; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 918; GFX10-CU-NEXT: s_endpgm 919; 920; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_ret_atomicrmw: 921; SKIP-CACHE-INV: ; %bb.0: ; %entry 922; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 923; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 924; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 925; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 926; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 927; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 928; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 929; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 930; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 931; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 932; SKIP-CACHE-INV-NEXT: s_endpgm 933 i32* %out, i32 %in) { 934entry: 935 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel 936 store i32 %val, i32* %out, align 4 937 ret void 938} 939; Agent-scope seq_cst xchg on %out whose old value is stored back to %out; the checks expect a wait before the swap and, except in the skip-cache-invalidations run, a cache invalidate after it. 940define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( 941; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw: 942; GFX7: ; %bb.0: ; %entry 943; GFX7-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 944; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 945; GFX7-NEXT: s_waitcnt lgkmcnt(0) 946; GFX7-NEXT: v_mov_b32_e32 v0, s0 947; GFX7-NEXT: v_mov_b32_e32 v1, s1 948; GFX7-NEXT: v_mov_b32_e32 v2, s2 949; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 950; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 951; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 952; GFX7-NEXT: buffer_wbinvl1_vol 953; GFX7-NEXT: s_waitcnt lgkmcnt(0) 954; GFX7-NEXT: flat_store_dword v[0:1], v2 955; GFX7-NEXT: s_endpgm 956; 957; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: 958; GFX10-WGP: ; %bb.0: ; %entry 959; GFX10-WGP-NEXT: s_clause 0x1 960; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 961; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 962; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 963; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 964; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 965; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 966; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 967; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 968; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 969; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 970; GFX10-WGP-NEXT: buffer_gl0_inv 971; GFX10-WGP-NEXT: buffer_gl1_inv 972; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 973; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 974; GFX10-WGP-NEXT: s_endpgm 975; 976; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: 977; GFX10-CU: ; %bb.0: ; %entry 978; GFX10-CU-NEXT: s_clause 0x1 979; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 980; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 981; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 982; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 983; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 984; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 985; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 986; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 987; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 988; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 989; GFX10-CU-NEXT: buffer_gl0_inv 990; GFX10-CU-NEXT: buffer_gl1_inv 991; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) 992; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 993; GFX10-CU-NEXT: s_endpgm 994; 995; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_ret_atomicrmw: 996; SKIP-CACHE-INV: ; %bb.0: ; %entry 997; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 998; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 999; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1000; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1001; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1002; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1003; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1004; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1005; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1006; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1007; SKIP-CACHE-INV-NEXT: s_endpgm 1008 i32* %out, i32 %in) { 1009entry: 1010 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst 1011 store i32 %val, i32* %out, align 4 1012 ret void 1013} 1014; Agent-scope cmpxchg (monotonic on success and on failure) four i32 elements past %out, result unused; the checks expect no wait or cache invalidate next to the cmpswap. 1015define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( 1016; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg: 1017; GFX7: ; %bb.0: ; %entry 1018; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1019; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1020; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1021; GFX7-NEXT: s_add_u32 s0, s0, 16 1022; GFX7-NEXT: s_addc_u32 s1, s1, 0 1023; GFX7-NEXT: v_mov_b32_e32 v0, s0 1024; GFX7-NEXT: v_mov_b32_e32 v2, s2 1025; GFX7-NEXT: v_mov_b32_e32 v1, s1 1026; GFX7-NEXT: v_mov_b32_e32 v3, s3 1027; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1028; GFX7-NEXT: s_endpgm 1029; 1030; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: 1031; GFX10-WGP: ; %bb.0: ; %entry 1032; GFX10-WGP-NEXT: s_clause 0x1 1033; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1034; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1035; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1036; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1037; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1038; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1039; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1040; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1041; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1042; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1043; GFX10-WGP-NEXT: s_endpgm 1044; 1045; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: 1046; GFX10-CU: ; %bb.0: ; %entry 1047; GFX10-CU-NEXT: s_clause 0x1 1048; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1049; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1050; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1051; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1052; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1053; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1054; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1055; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1056; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1057; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1058; GFX10-CU-NEXT: s_endpgm 1059; 1060; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_cmpxchg: 1061; SKIP-CACHE-INV: ; %bb.0: ; %entry 1062; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1063; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1064; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1065; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1066; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1067; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1068; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1069; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1070; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1071; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1072; SKIP-CACHE-INV-NEXT: s_endpgm 1073 i32* %out, i32 %in, i32 %old) { 1074entry: 1075 %gep = getelementptr i32, i32* %out, i32 4 1076 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic 1077 ret void 1078} 1079; Agent-scope cmpxchg (acquire on success, monotonic on failure) four i32 elements past %out, result unused; the checks expect a wait after the cmpswap and, except in the skip-cache-invalidations run, a cache invalidate after that wait. 1080define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( 1081; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: 1082; GFX7: ; %bb.0: ; %entry 1083; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1084; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1085; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX7-NEXT: s_add_u32 s0, s0, 16 1087; GFX7-NEXT: s_addc_u32 s1, s1, 0 1088; GFX7-NEXT: v_mov_b32_e32 v0, s0 1089; GFX7-NEXT: v_mov_b32_e32 v2, s2 1090; GFX7-NEXT: v_mov_b32_e32 v1, s1 1091; GFX7-NEXT: v_mov_b32_e32 v3, s3 1092; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1093; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1094; GFX7-NEXT: buffer_wbinvl1_vol 1095; GFX7-NEXT: s_endpgm 1096; 1097; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: 1098; GFX10-WGP: ; %bb.0: ; %entry 1099; GFX10-WGP-NEXT: s_clause 0x1 1100; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1101; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1102; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1103; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1104; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1105; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1106; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1107; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1108; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1109; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1110; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1112; GFX10-WGP-NEXT: buffer_gl0_inv 1113; GFX10-WGP-NEXT: buffer_gl1_inv 1114; GFX10-WGP-NEXT: s_endpgm 1115; 1116; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: 1117; GFX10-CU: ; %bb.0: ; %entry 1118; GFX10-CU-NEXT: s_clause 0x1 1119; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1120; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1121; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1122; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1123; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1124; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1125; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1126; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1127; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1128; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1129; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1131; GFX10-CU-NEXT: buffer_gl0_inv 1132; GFX10-CU-NEXT: buffer_gl1_inv 1133; 
GFX10-CU-NEXT: s_endpgm 1134; 1135; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_cmpxchg: 1136; SKIP-CACHE-INV: ; %bb.0: ; %entry 1137; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1138; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1139; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1140; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1141; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1142; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1143; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1144; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1145; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1146; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1147; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1148; SKIP-CACHE-INV-NEXT: s_endpgm 1149 i32* %out, i32 %in, i32 %old) { 1150entry: 1151 %gep = getelementptr i32, i32* %out, i32 4 1152 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic 1153 ret void 1154} 1155; Agent-scope cmpxchg (release on success, monotonic on failure) four i32 elements past %out, result unused; the checks expect a wait before the cmpswap and no cache invalidate. 1156define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( 1157; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: 1158; GFX7: ; %bb.0: ; %entry 1159; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1160; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1161; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX7-NEXT: s_add_u32 s0, s0, 16 1163; GFX7-NEXT: s_addc_u32 s1, s1, 0 1164; GFX7-NEXT: v_mov_b32_e32 v0, s0 1165; GFX7-NEXT: v_mov_b32_e32 v2, s2 1166; GFX7-NEXT: v_mov_b32_e32 v1, s1 1167; GFX7-NEXT: v_mov_b32_e32 v3, s3 1168; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1169; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1170; GFX7-NEXT: s_endpgm 1171; 1172; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: 1173; GFX10-WGP: ; %bb.0: ; %entry 1174; GFX10-WGP-NEXT: s_clause 0x1 1175; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1176; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1177; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1178; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1179; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 
1180; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1181; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1182; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1183; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1184; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1185; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1186; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1187; GFX10-WGP-NEXT: s_endpgm 1188; 1189; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: 1190; GFX10-CU: ; %bb.0: ; %entry 1191; GFX10-CU-NEXT: s_clause 0x1 1192; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1193; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1194; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1196; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1197; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1198; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1199; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1200; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1201; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1202; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1203; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1204; GFX10-CU-NEXT: s_endpgm 1205; 1206; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_cmpxchg: 1207; SKIP-CACHE-INV: ; %bb.0: ; %entry 1208; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1209; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1210; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1211; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1212; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1213; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1214; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1215; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1216; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1217; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1218; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1219; SKIP-CACHE-INV-NEXT: s_endpgm 1220 i32* %out, i32 %in, i32 %old) { 1221entry: 1222 %gep = getelementptr i32, i32* %out, i32 4 1223 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release 
monotonic 1224 ret void 1225} 1226; Agent-scope cmpxchg (acq_rel on success, monotonic on failure) four i32 elements past %out, result unused; the checks expect a wait before and after the cmpswap and, except in the skip-cache-invalidations run, a trailing cache invalidate. 1227define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( 1228; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: 1229; GFX7: ; %bb.0: ; %entry 1230; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1231; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1232; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1233; GFX7-NEXT: s_add_u32 s0, s0, 16 1234; GFX7-NEXT: s_addc_u32 s1, s1, 0 1235; GFX7-NEXT: v_mov_b32_e32 v0, s0 1236; GFX7-NEXT: v_mov_b32_e32 v2, s2 1237; GFX7-NEXT: v_mov_b32_e32 v1, s1 1238; GFX7-NEXT: v_mov_b32_e32 v3, s3 1239; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1240; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1241; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1242; GFX7-NEXT: buffer_wbinvl1_vol 1243; GFX7-NEXT: s_endpgm 1244; 1245; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: 1246; GFX10-WGP: ; %bb.0: ; %entry 1247; GFX10-WGP-NEXT: s_clause 0x1 1248; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1249; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1250; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1251; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1252; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1253; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1254; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1255; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1256; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1257; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1258; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1259; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1260; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1262; GFX10-WGP-NEXT: buffer_gl0_inv 1263; GFX10-WGP-NEXT: buffer_gl1_inv 1264; GFX10-WGP-NEXT: s_endpgm 1265; 1266; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: 1267; GFX10-CU: ; %bb.0: ; %entry 1268; GFX10-CU-NEXT: s_clause 0x1 1269; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1270; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1271; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1272; GFX10-CU-NEXT: s_add_u32 s0, s0, 
16 1273; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1274; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1275; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1276; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1277; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1278; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1279; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1280; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1281; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1282; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1283; GFX10-CU-NEXT: buffer_gl0_inv 1284; GFX10-CU-NEXT: buffer_gl1_inv 1285; GFX10-CU-NEXT: s_endpgm 1286; 1287; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: 1288; SKIP-CACHE-INV: ; %bb.0: ; %entry 1289; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1290; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1291; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1292; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1293; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1294; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1295; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1296; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1297; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1298; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1299; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1300; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1301; SKIP-CACHE-INV-NEXT: s_endpgm 1302 i32* %out, i32 %in, i32 %old) { 1303entry: 1304 %gep = getelementptr i32, i32* %out, i32 4 1305 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic 1306 ret void 1307} 1308; Agent-scope cmpxchg (seq_cst on success, monotonic on failure) four i32 elements past %out, result unused; the checks expect a wait before and after the cmpswap and, except in the skip-cache-invalidations run, a trailing cache invalidate. 1309define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( 1310; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: 1311; GFX7: ; %bb.0: ; %entry 1312; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1313; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1314; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX7-NEXT: s_add_u32 s0, s0, 16 1316; GFX7-NEXT: s_addc_u32 s1, s1, 0 1317; GFX7-NEXT: v_mov_b32_e32 v0, s0 1318; GFX7-NEXT: v_mov_b32_e32 
v2, s2 1319; GFX7-NEXT: v_mov_b32_e32 v1, s1 1320; GFX7-NEXT: v_mov_b32_e32 v3, s3 1321; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1322; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1323; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1324; GFX7-NEXT: buffer_wbinvl1_vol 1325; GFX7-NEXT: s_endpgm 1326; 1327; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: 1328; GFX10-WGP: ; %bb.0: ; %entry 1329; GFX10-WGP-NEXT: s_clause 0x1 1330; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1331; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1332; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1333; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1334; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1335; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1336; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1337; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1338; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1339; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1340; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1341; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1342; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1344; GFX10-WGP-NEXT: buffer_gl0_inv 1345; GFX10-WGP-NEXT: buffer_gl1_inv 1346; GFX10-WGP-NEXT: s_endpgm 1347; 1348; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: 1349; GFX10-CU: ; %bb.0: ; %entry 1350; GFX10-CU-NEXT: s_clause 0x1 1351; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1352; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1353; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1354; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1355; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1356; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1357; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1358; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1359; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1360; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1361; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1362; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1363; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1365; GFX10-CU-NEXT: 
buffer_gl0_inv 1366; GFX10-CU-NEXT: buffer_gl1_inv 1367; GFX10-CU-NEXT: s_endpgm 1368; 1369; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: 1370; SKIP-CACHE-INV: ; %bb.0: ; %entry 1371; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1372; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1373; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1374; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1375; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1376; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1377; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1378; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1379; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1380; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1381; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1382; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1383; SKIP-CACHE-INV-NEXT: s_endpgm 1384 i32* %out, i32 %in, i32 %old) { 1385entry: 1386 %gep = getelementptr i32, i32* %out, i32 4 1387 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic 1388 ret void 1389} 1390; Agent-scope cmpxchg (acquire on success and on failure) four i32 elements past %out, result unused; the checks expect a wait after the cmpswap and, except in the skip-cache-invalidations run, a cache invalidate after that wait. 1391define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( 1392; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: 1393; GFX7: ; %bb.0: ; %entry 1394; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1395; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1396; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX7-NEXT: s_add_u32 s0, s0, 16 1398; GFX7-NEXT: s_addc_u32 s1, s1, 0 1399; GFX7-NEXT: v_mov_b32_e32 v0, s0 1400; GFX7-NEXT: v_mov_b32_e32 v2, s2 1401; GFX7-NEXT: v_mov_b32_e32 v1, s1 1402; GFX7-NEXT: v_mov_b32_e32 v3, s3 1403; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1404; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1405; GFX7-NEXT: buffer_wbinvl1_vol 1406; GFX7-NEXT: s_endpgm 1407; 1408; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: 1409; GFX10-WGP: ; %bb.0: ; %entry 1410; GFX10-WGP-NEXT: s_clause 0x1 1411; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1412; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 1413; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1414; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1415; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1416; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1417; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1418; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1419; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1420; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1421; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1422; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1423; GFX10-WGP-NEXT: buffer_gl0_inv 1424; GFX10-WGP-NEXT: buffer_gl1_inv 1425; GFX10-WGP-NEXT: s_endpgm 1426; 1427; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: 1428; GFX10-CU: ; %bb.0: ; %entry 1429; GFX10-CU-NEXT: s_clause 0x1 1430; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1431; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1432; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1433; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1434; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1435; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1436; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1437; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1438; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1439; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1440; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1442; GFX10-CU-NEXT: buffer_gl0_inv 1443; GFX10-CU-NEXT: buffer_gl1_inv 1444; GFX10-CU-NEXT: s_endpgm 1445; 1446; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_cmpxchg: 1447; SKIP-CACHE-INV: ; %bb.0: ; %entry 1448; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1449; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1450; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1451; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1452; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1453; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1454; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1455; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1456; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1457; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1458; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1459; SKIP-CACHE-INV-NEXT: s_endpgm 1460 i32* %out, i32 %in, i32 %old) { 1461entry: 1462 %gep = getelementptr i32, i32* %out, i32 4 1463 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire 1464 ret void 1465} 1466; Agent-scope cmpxchg (release on success, acquire on failure) four i32 elements past %out, result unused; the checks expect a wait before and after the cmpswap and, except in the skip-cache-invalidations run, a trailing cache invalidate. 1467define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( 1468; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: 1469; GFX7: ; %bb.0: ; %entry 1470; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1471; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1472; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1473; GFX7-NEXT: s_add_u32 s0, s0, 16 1474; GFX7-NEXT: s_addc_u32 s1, s1, 0 1475; GFX7-NEXT: v_mov_b32_e32 v0, s0 1476; GFX7-NEXT: v_mov_b32_e32 v2, s2 1477; GFX7-NEXT: v_mov_b32_e32 v1, s1 1478; GFX7-NEXT: v_mov_b32_e32 v3, s3 1479; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1480; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1481; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1482; GFX7-NEXT: buffer_wbinvl1_vol 1483; GFX7-NEXT: s_endpgm 1484; 1485; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: 1486; GFX10-WGP: ; %bb.0: ; %entry 1487; GFX10-WGP-NEXT: s_clause 0x1 1488; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1489; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1490; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1492; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1493; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1494; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1495; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1496; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1497; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1498; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1499; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1500; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1501; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1502; GFX10-WGP-NEXT: buffer_gl0_inv 1503; GFX10-WGP-NEXT: buffer_gl1_inv 1504; GFX10-WGP-NEXT: s_endpgm 1505; 1506; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: 
1507; GFX10-CU: ; %bb.0: ; %entry 1508; GFX10-CU-NEXT: s_clause 0x1 1509; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1510; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1511; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1512; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1513; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1514; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1515; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1516; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1517; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1518; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1519; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1520; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1521; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1522; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1523; GFX10-CU-NEXT: buffer_gl0_inv 1524; GFX10-CU-NEXT: buffer_gl1_inv 1525; GFX10-CU-NEXT: s_endpgm 1526; 1527; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_cmpxchg: 1528; SKIP-CACHE-INV: ; %bb.0: ; %entry 1529; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1530; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1531; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1532; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1533; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1534; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1535; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1536; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1537; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1538; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1539; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1540; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1541; SKIP-CACHE-INV-NEXT: s_endpgm 1542 i32* %out, i32 %in, i32 %old) { 1543entry: 1544 %gep = getelementptr i32, i32* %out, i32 4 1545 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire 1546 ret void 1547} 1548; Agent-scope cmpxchg (acq_rel on success, acquire on failure) four i32 elements past %out, result unused; the checks expect a wait before and after the cmpswap and, except in the skip-cache-invalidations run, a trailing cache invalidate. 1549define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( 1550; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: 1551; GFX7: ; %bb.0: ; %entry 1552; GFX7-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 1553; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1554; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1555; GFX7-NEXT: s_add_u32 s0, s0, 16 1556; GFX7-NEXT: s_addc_u32 s1, s1, 0 1557; GFX7-NEXT: v_mov_b32_e32 v0, s0 1558; GFX7-NEXT: v_mov_b32_e32 v2, s2 1559; GFX7-NEXT: v_mov_b32_e32 v1, s1 1560; GFX7-NEXT: v_mov_b32_e32 v3, s3 1561; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1562; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1563; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1564; GFX7-NEXT: buffer_wbinvl1_vol 1565; GFX7-NEXT: s_endpgm 1566; 1567; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: 1568; GFX10-WGP: ; %bb.0: ; %entry 1569; GFX10-WGP-NEXT: s_clause 0x1 1570; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1571; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1572; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1573; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1574; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1575; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1576; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1577; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1578; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1579; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1580; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1581; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1582; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1584; GFX10-WGP-NEXT: buffer_gl0_inv 1585; GFX10-WGP-NEXT: buffer_gl1_inv 1586; GFX10-WGP-NEXT: s_endpgm 1587; 1588; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: 1589; GFX10-CU: ; %bb.0: ; %entry 1590; GFX10-CU-NEXT: s_clause 0x1 1591; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1592; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1593; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1594; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1595; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1596; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1597; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1598; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1599; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1600; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1601; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1602; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1603; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1604; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1605; GFX10-CU-NEXT: buffer_gl0_inv 1606; GFX10-CU-NEXT: buffer_gl1_inv 1607; GFX10-CU-NEXT: s_endpgm 1608; 1609; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_cmpxchg: 1610; SKIP-CACHE-INV: ; %bb.0: ; %entry 1611; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1612; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1613; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1614; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1615; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1616; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1617; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1618; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1619; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1620; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1621; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1622; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1623; SKIP-CACHE-INV-NEXT: s_endpgm 1624 i32* %out, i32 %in, i32 %old) { 1625entry: 1626 %gep = getelementptr i32, i32* %out, i32 4 1627 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire 1628 ret void 1629} 1630; Agent-scope cmpxchg (seq_cst on success, acquire on failure) four i32 elements past %out, result unused; the checks expect a wait before and after the cmpswap and, except in the skip-cache-invalidations run, a trailing cache invalidate. 1631define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( 1632; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: 1633; GFX7: ; %bb.0: ; %entry 1634; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1635; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1636; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1637; GFX7-NEXT: s_add_u32 s0, s0, 16 1638; GFX7-NEXT: s_addc_u32 s1, s1, 0 1639; GFX7-NEXT: v_mov_b32_e32 v0, s0 1640; GFX7-NEXT: v_mov_b32_e32 v2, s2 1641; GFX7-NEXT: v_mov_b32_e32 v1, s1 1642; GFX7-NEXT: v_mov_b32_e32 v3, s3 1643; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1644; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1645; GFX7-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 1646; GFX7-NEXT: buffer_wbinvl1_vol 1647; GFX7-NEXT: s_endpgm 1648; 1649; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: 1650; GFX10-WGP: ; %bb.0: ; %entry 1651; GFX10-WGP-NEXT: s_clause 0x1 1652; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1653; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1654; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1655; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1656; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1657; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1658; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1659; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1660; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1661; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1662; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1663; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1664; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1665; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1666; GFX10-WGP-NEXT: buffer_gl0_inv 1667; GFX10-WGP-NEXT: buffer_gl1_inv 1668; GFX10-WGP-NEXT: s_endpgm 1669; 1670; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: 1671; GFX10-CU: ; %bb.0: ; %entry 1672; GFX10-CU-NEXT: s_clause 0x1 1673; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1674; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1675; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1676; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1677; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1678; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1679; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1680; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1681; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1682; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1683; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1684; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1685; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1686; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1687; GFX10-CU-NEXT: buffer_gl0_inv 1688; GFX10-CU-NEXT: buffer_gl1_inv 1689; GFX10-CU-NEXT: s_endpgm 1690; 1691; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_cmpxchg: 1692; SKIP-CACHE-INV: ; %bb.0: ; %entry 1693; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x9 1694; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1695; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1696; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1697; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1698; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1699; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1700; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1701; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1702; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1703; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1704; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1705; SKIP-CACHE-INV-NEXT: s_endpgm 1706 i32* %out, i32 %in, i32 %old) { 1707entry: 1708 %gep = getelementptr i32, i32* %out, i32 4 1709 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire 1710 ret void 1711} 1712 1713define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( 1714; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: 1715; GFX7: ; %bb.0: ; %entry 1716; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1717; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1718; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1719; GFX7-NEXT: s_add_u32 s0, s0, 16 1720; GFX7-NEXT: s_addc_u32 s1, s1, 0 1721; GFX7-NEXT: v_mov_b32_e32 v0, s0 1722; GFX7-NEXT: v_mov_b32_e32 v2, s2 1723; GFX7-NEXT: v_mov_b32_e32 v1, s1 1724; GFX7-NEXT: v_mov_b32_e32 v3, s3 1725; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1726; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1727; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1728; GFX7-NEXT: buffer_wbinvl1_vol 1729; GFX7-NEXT: s_endpgm 1730; 1731; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: 1732; GFX10-WGP: ; %bb.0: ; %entry 1733; GFX10-WGP-NEXT: s_clause 0x1 1734; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1735; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1736; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1737; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 1738; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 1739; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 1740; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1741; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1742; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1743; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1744; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1745; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1746; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1747; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1748; GFX10-WGP-NEXT: buffer_gl0_inv 1749; GFX10-WGP-NEXT: buffer_gl1_inv 1750; GFX10-WGP-NEXT: s_endpgm 1751; 1752; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: 1753; GFX10-CU: ; %bb.0: ; %entry 1754; GFX10-CU-NEXT: s_clause 0x1 1755; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1756; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1757; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1758; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 1759; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 1760; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1761; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1762; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1763; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1764; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1765; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1766; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1767; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1769; GFX10-CU-NEXT: buffer_gl0_inv 1770; GFX10-CU-NEXT: buffer_gl1_inv 1771; GFX10-CU-NEXT: s_endpgm 1772; 1773; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: 1774; SKIP-CACHE-INV: ; %bb.0: ; %entry 1775; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1776; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1777; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1778; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 1779; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 1780; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1781; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1782; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1783; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1784; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1785; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1786; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1787; SKIP-CACHE-INV-NEXT: s_endpgm 1788 i32* %out, i32 %in, i32 %old) { 1789entry: 1790 %gep = getelementptr i32, i32* %out, i32 4 1791 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 1792 ret void 1793} 1794 1795define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( 1796; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: 1797; GFX7: ; %bb.0: ; %entry 1798; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1799; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1800; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1801; GFX7-NEXT: s_add_u32 s4, s0, 16 1802; GFX7-NEXT: s_addc_u32 s5, s1, 0 1803; GFX7-NEXT: v_mov_b32_e32 v0, s4 1804; GFX7-NEXT: v_mov_b32_e32 v2, s2 1805; GFX7-NEXT: v_mov_b32_e32 v1, s5 1806; GFX7-NEXT: v_mov_b32_e32 v3, s3 1807; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1808; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1809; GFX7-NEXT: buffer_wbinvl1_vol 1810; GFX7-NEXT: v_mov_b32_e32 v0, s0 1811; GFX7-NEXT: v_mov_b32_e32 v1, s1 1812; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1813; GFX7-NEXT: flat_store_dword v[0:1], v2 1814; GFX7-NEXT: s_endpgm 1815; 1816; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: 1817; GFX10-WGP: ; %bb.0: ; %entry 1818; GFX10-WGP-NEXT: s_clause 0x1 1819; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1820; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1821; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1822; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 1823; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 1824; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 1825; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1826; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 1827; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1828; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1829; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1830; GFX10-WGP-NEXT: buffer_gl0_inv 1831; GFX10-WGP-NEXT: buffer_gl1_inv 
1832; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1833; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1834; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1835; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1836; GFX10-WGP-NEXT: s_endpgm 1837; 1838; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: 1839; GFX10-CU: ; %bb.0: ; %entry 1840; GFX10-CU-NEXT: s_clause 0x1 1841; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1842; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1843; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1844; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 1845; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 1846; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 1847; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1848; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 1849; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1850; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1851; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1852; GFX10-CU-NEXT: buffer_gl0_inv 1853; GFX10-CU-NEXT: buffer_gl1_inv 1854; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1855; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1856; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1857; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1858; GFX10-CU-NEXT: s_endpgm 1859; 1860; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: 1861; SKIP-CACHE-INV: ; %bb.0: ; %entry 1862; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1863; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1864; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1865; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 1866; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 1867; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 1868; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1869; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 1870; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1871; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1872; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1873; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1874; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1875; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 
1876; SKIP-CACHE-INV-NEXT: s_endpgm 1877 i32* %out, i32 %in, i32 %old) { 1878entry: 1879 %gep = getelementptr i32, i32* %out, i32 4 1880 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic 1881 %val0 = extractvalue { i32, i1 } %val, 0 1882 store i32 %val0, i32* %out, align 4 1883 ret void 1884} 1885 1886define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( 1887; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: 1888; GFX7: ; %bb.0: ; %entry 1889; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1890; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1891; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1892; GFX7-NEXT: s_add_u32 s4, s0, 16 1893; GFX7-NEXT: s_addc_u32 s5, s1, 0 1894; GFX7-NEXT: v_mov_b32_e32 v0, s4 1895; GFX7-NEXT: v_mov_b32_e32 v2, s2 1896; GFX7-NEXT: v_mov_b32_e32 v1, s5 1897; GFX7-NEXT: v_mov_b32_e32 v3, s3 1898; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1899; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1900; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1901; GFX7-NEXT: buffer_wbinvl1_vol 1902; GFX7-NEXT: v_mov_b32_e32 v0, s0 1903; GFX7-NEXT: v_mov_b32_e32 v1, s1 1904; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1905; GFX7-NEXT: flat_store_dword v[0:1], v2 1906; GFX7-NEXT: s_endpgm 1907; 1908; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: 1909; GFX10-WGP: ; %bb.0: ; %entry 1910; GFX10-WGP-NEXT: s_clause 0x1 1911; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1912; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1913; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1914; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 1915; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 1916; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 1917; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 1918; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 1919; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 1920; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1921; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1922; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1923; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1924; GFX10-WGP-NEXT: buffer_gl0_inv 1925; GFX10-WGP-NEXT: buffer_gl1_inv 1926; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1927; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1928; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1929; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 1930; GFX10-WGP-NEXT: s_endpgm 1931; 1932; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: 1933; GFX10-CU: ; %bb.0: ; %entry 1934; GFX10-CU-NEXT: s_clause 0x1 1935; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1936; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1937; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1938; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 1939; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 1940; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 1941; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 1942; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 1943; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 1944; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1945; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 1946; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1947; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1948; GFX10-CU-NEXT: buffer_gl0_inv 1949; GFX10-CU-NEXT: buffer_gl1_inv 1950; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1951; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1952; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1953; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 1954; GFX10-CU-NEXT: s_endpgm 1955; 1956; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: 1957; SKIP-CACHE-INV: ; %bb.0: ; %entry 1958; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 1959; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1960; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1961; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 1962; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 1963; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 1964; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 1965; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 1966; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 1967; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1968; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1969; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1970; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 1971; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 1972; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 1973; SKIP-CACHE-INV-NEXT: s_endpgm 1974 i32* %out, i32 %in, i32 %old) { 1975entry: 1976 %gep = getelementptr i32, i32* %out, i32 4 1977 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic 1978 %val0 = extractvalue { i32, i1 } %val, 0 1979 store i32 %val0, i32* %out, align 4 1980 ret void 1981} 1982 1983define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( 1984; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: 1985; GFX7: ; %bb.0: ; %entry 1986; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1987; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1988; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1989; GFX7-NEXT: s_add_u32 s4, s0, 16 1990; GFX7-NEXT: s_addc_u32 s5, s1, 0 1991; GFX7-NEXT: v_mov_b32_e32 v0, s4 1992; GFX7-NEXT: v_mov_b32_e32 v2, s2 1993; GFX7-NEXT: v_mov_b32_e32 v1, s5 1994; GFX7-NEXT: v_mov_b32_e32 v3, s3 1995; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1996; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1997; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1998; GFX7-NEXT: buffer_wbinvl1_vol 1999; GFX7-NEXT: v_mov_b32_e32 v0, s0 2000; GFX7-NEXT: v_mov_b32_e32 v1, s1 2001; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2002; GFX7-NEXT: flat_store_dword v[0:1], v2 2003; GFX7-NEXT: s_endpgm 2004; 2005; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: 2006; GFX10-WGP: ; %bb.0: ; %entry 2007; GFX10-WGP-NEXT: s_clause 0x1 2008; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2009; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2010; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2011; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2012; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2013; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2014; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2015; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s5 2016; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2017; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2018; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2019; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2020; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2021; GFX10-WGP-NEXT: buffer_gl0_inv 2022; GFX10-WGP-NEXT: buffer_gl1_inv 2023; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2024; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2025; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2026; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2027; GFX10-WGP-NEXT: s_endpgm 2028; 2029; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: 2030; GFX10-CU: ; %bb.0: ; %entry 2031; GFX10-CU-NEXT: s_clause 0x1 2032; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2033; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2034; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2035; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2036; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2037; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2038; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2039; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2040; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2041; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2042; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2043; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2044; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2045; GFX10-CU-NEXT: buffer_gl0_inv 2046; GFX10-CU-NEXT: buffer_gl1_inv 2047; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2048; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2049; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2050; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2051; GFX10-CU-NEXT: s_endpgm 2052; 2053; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: 2054; SKIP-CACHE-INV: ; %bb.0: ; %entry 2055; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2056; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2057; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2058; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2059; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2060; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2061; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2062; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2063; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2064; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2065; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2066; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2067; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2068; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2069; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2070; SKIP-CACHE-INV-NEXT: s_endpgm 2071 i32* %out, i32 %in, i32 %old) { 2072entry: 2073 %gep = getelementptr i32, i32* %out, i32 4 2074 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic 2075 %val0 = extractvalue { i32, i1 } %val, 0 2076 store i32 %val0, i32* %out, align 4 2077 ret void 2078} 2079 2080define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( 2081; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: 2082; GFX7: ; %bb.0: ; %entry 2083; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2084; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2085; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2086; GFX7-NEXT: s_add_u32 s4, s0, 16 2087; GFX7-NEXT: s_addc_u32 s5, s1, 0 2088; GFX7-NEXT: v_mov_b32_e32 v0, s4 2089; GFX7-NEXT: v_mov_b32_e32 v2, s2 2090; GFX7-NEXT: v_mov_b32_e32 v1, s5 2091; GFX7-NEXT: v_mov_b32_e32 v3, s3 2092; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2093; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2094; GFX7-NEXT: buffer_wbinvl1_vol 2095; GFX7-NEXT: v_mov_b32_e32 v0, s0 2096; GFX7-NEXT: v_mov_b32_e32 v1, s1 2097; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2098; GFX7-NEXT: flat_store_dword v[0:1], v2 2099; GFX7-NEXT: s_endpgm 2100; 2101; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: 2102; GFX10-WGP: ; %bb.0: ; %entry 2103; GFX10-WGP-NEXT: s_clause 0x1 2104; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2105; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2106; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 2107; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2108; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2109; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2110; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2111; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2112; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2113; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2114; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2115; GFX10-WGP-NEXT: buffer_gl0_inv 2116; GFX10-WGP-NEXT: buffer_gl1_inv 2117; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2118; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2119; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2120; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2121; GFX10-WGP-NEXT: s_endpgm 2122; 2123; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: 2124; GFX10-CU: ; %bb.0: ; %entry 2125; GFX10-CU-NEXT: s_clause 0x1 2126; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2127; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2128; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2129; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2130; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2131; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2132; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2133; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2134; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2135; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2136; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2137; GFX10-CU-NEXT: buffer_gl0_inv 2138; GFX10-CU-NEXT: buffer_gl1_inv 2139; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2140; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2141; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2142; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2143; GFX10-CU-NEXT: s_endpgm 2144; 2145; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: 2146; SKIP-CACHE-INV: ; %bb.0: ; %entry 2147; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2148; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2149; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2150; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2151; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 
2152; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2153; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2154; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2155; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2156; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2157; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2158; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2159; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2160; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2161; SKIP-CACHE-INV-NEXT: s_endpgm 2162 i32* %out, i32 %in, i32 %old) { 2163entry: 2164 %gep = getelementptr i32, i32* %out, i32 4 2165 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire 2166 %val0 = extractvalue { i32, i1 } %val, 0 2167 store i32 %val0, i32* %out, align 4 2168 ret void 2169} 2170 2171define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( 2172; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: 2173; GFX7: ; %bb.0: ; %entry 2174; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2175; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2176; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2177; GFX7-NEXT: s_add_u32 s4, s0, 16 2178; GFX7-NEXT: s_addc_u32 s5, s1, 0 2179; GFX7-NEXT: v_mov_b32_e32 v0, s4 2180; GFX7-NEXT: v_mov_b32_e32 v2, s2 2181; GFX7-NEXT: v_mov_b32_e32 v1, s5 2182; GFX7-NEXT: v_mov_b32_e32 v3, s3 2183; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2184; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2185; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2186; GFX7-NEXT: buffer_wbinvl1_vol 2187; GFX7-NEXT: v_mov_b32_e32 v0, s0 2188; GFX7-NEXT: v_mov_b32_e32 v1, s1 2189; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2190; GFX7-NEXT: flat_store_dword v[0:1], v2 2191; GFX7-NEXT: s_endpgm 2192; 2193; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: 2194; GFX10-WGP: ; %bb.0: ; %entry 2195; GFX10-WGP-NEXT: s_clause 0x1 2196; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2197; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2198; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 2199; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2200; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2201; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2202; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2203; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2204; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2205; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2206; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2207; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2208; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2209; GFX10-WGP-NEXT: buffer_gl0_inv 2210; GFX10-WGP-NEXT: buffer_gl1_inv 2211; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2212; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2213; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2214; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2215; GFX10-WGP-NEXT: s_endpgm 2216; 2217; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: 2218; GFX10-CU: ; %bb.0: ; %entry 2219; GFX10-CU-NEXT: s_clause 0x1 2220; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2221; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2222; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2223; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2224; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2225; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2226; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2227; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2228; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2229; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2230; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2231; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2232; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2233; GFX10-CU-NEXT: buffer_gl0_inv 2234; GFX10-CU-NEXT: buffer_gl1_inv 2235; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2236; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2237; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2238; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2239; GFX10-CU-NEXT: s_endpgm 2240; 2241; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_ret_cmpxchg: 2242; SKIP-CACHE-INV: ; %bb.0: ; %entry 2243; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2244; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2245; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2246; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2247; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2248; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2249; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2250; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2251; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2252; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2253; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2254; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2255; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2256; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2257; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2258; SKIP-CACHE-INV-NEXT: s_endpgm 2259 i32* %out, i32 %in, i32 %old) { 2260entry: 2261 %gep = getelementptr i32, i32* %out, i32 4 2262 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire 2263 %val0 = extractvalue { i32, i1 } %val, 0 2264 store i32 %val0, i32* %out, align 4 2265 ret void 2266} 2267 2268define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( 2269; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: 2270; GFX7: ; %bb.0: ; %entry 2271; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2272; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2273; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2274; GFX7-NEXT: s_add_u32 s4, s0, 16 2275; GFX7-NEXT: s_addc_u32 s5, s1, 0 2276; GFX7-NEXT: v_mov_b32_e32 v0, s4 2277; GFX7-NEXT: v_mov_b32_e32 v2, s2 2278; GFX7-NEXT: v_mov_b32_e32 v1, s5 2279; GFX7-NEXT: v_mov_b32_e32 v3, s3 2280; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2281; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2282; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2283; GFX7-NEXT: buffer_wbinvl1_vol 2284; GFX7-NEXT: v_mov_b32_e32 v0, s0 2285; GFX7-NEXT: v_mov_b32_e32 v1, s1 2286; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2287; GFX7-NEXT: flat_store_dword v[0:1], v2 2288; GFX7-NEXT: s_endpgm 2289; 2290; GFX10-WGP-LABEL: 
flat_agent_acq_rel_acquire_ret_cmpxchg: 2291; GFX10-WGP: ; %bb.0: ; %entry 2292; GFX10-WGP-NEXT: s_clause 0x1 2293; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2294; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2295; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2296; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2297; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2298; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2299; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2300; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2301; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2302; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2303; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2304; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2305; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2306; GFX10-WGP-NEXT: buffer_gl0_inv 2307; GFX10-WGP-NEXT: buffer_gl1_inv 2308; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2309; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2310; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2311; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2312; GFX10-WGP-NEXT: s_endpgm 2313; 2314; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: 2315; GFX10-CU: ; %bb.0: ; %entry 2316; GFX10-CU-NEXT: s_clause 0x1 2317; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2318; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2319; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2320; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2321; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2322; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2323; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2324; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2325; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2326; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2327; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2328; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2329; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2330; GFX10-CU-NEXT: buffer_gl0_inv 2331; GFX10-CU-NEXT: buffer_gl1_inv 2332; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2333; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2334; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2335; GFX10-CU-NEXT: 
flat_store_dword v[0:1], v2 2336; GFX10-CU-NEXT: s_endpgm 2337; 2338; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: 2339; SKIP-CACHE-INV: ; %bb.0: ; %entry 2340; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2341; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2342; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2343; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2344; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2345; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2346; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2347; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2348; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2349; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2350; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2351; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2352; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2353; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2354; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2355; SKIP-CACHE-INV-NEXT: s_endpgm 2356 i32* %out, i32 %in, i32 %old) { 2357entry: 2358 %gep = getelementptr i32, i32* %out, i32 4 2359 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire 2360 %val0 = extractvalue { i32, i1 } %val, 0 2361 store i32 %val0, i32* %out, align 4 2362 ret void 2363} 2364 2365define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( 2366; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: 2367; GFX7: ; %bb.0: ; %entry 2368; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2369; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2370; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2371; GFX7-NEXT: s_add_u32 s4, s0, 16 2372; GFX7-NEXT: s_addc_u32 s5, s1, 0 2373; GFX7-NEXT: v_mov_b32_e32 v0, s4 2374; GFX7-NEXT: v_mov_b32_e32 v2, s2 2375; GFX7-NEXT: v_mov_b32_e32 v1, s5 2376; GFX7-NEXT: v_mov_b32_e32 v3, s3 2377; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2378; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2379; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2380; 
GFX7-NEXT: buffer_wbinvl1_vol 2381; GFX7-NEXT: v_mov_b32_e32 v0, s0 2382; GFX7-NEXT: v_mov_b32_e32 v1, s1 2383; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2384; GFX7-NEXT: flat_store_dword v[0:1], v2 2385; GFX7-NEXT: s_endpgm 2386; 2387; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: 2388; GFX10-WGP: ; %bb.0: ; %entry 2389; GFX10-WGP-NEXT: s_clause 0x1 2390; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2391; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2392; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2393; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2394; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2395; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2396; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2397; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2398; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2399; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2400; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2401; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2402; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2403; GFX10-WGP-NEXT: buffer_gl0_inv 2404; GFX10-WGP-NEXT: buffer_gl1_inv 2405; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2406; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2407; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2408; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2409; GFX10-WGP-NEXT: s_endpgm 2410; 2411; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: 2412; GFX10-CU: ; %bb.0: ; %entry 2413; GFX10-CU-NEXT: s_clause 0x1 2414; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2415; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2416; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2417; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2418; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2419; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2420; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2421; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 2422; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2423; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2424; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2425; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2426; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 2427; GFX10-CU-NEXT: buffer_gl0_inv 2428; GFX10-CU-NEXT: buffer_gl1_inv 2429; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2430; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2431; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2432; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2433; GFX10-CU-NEXT: s_endpgm 2434; 2435; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: 2436; SKIP-CACHE-INV: ; %bb.0: ; %entry 2437; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2438; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2439; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2440; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2441; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2442; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2443; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2444; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2445; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2446; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2447; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2448; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2449; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2450; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2451; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2452; SKIP-CACHE-INV-NEXT: s_endpgm 2453 i32* %out, i32 %in, i32 %old) { 2454entry: 2455 %gep = getelementptr i32, i32* %out, i32 4 2456 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire 2457 %val0 = extractvalue { i32, i1 } %val, 0 2458 store i32 %val0, i32* %out, align 4 2459 ret void 2460} 2461 2462define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( 2463; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: 2464; GFX7: ; %bb.0: ; %entry 2465; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2466; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2467; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2468; GFX7-NEXT: s_add_u32 s4, s0, 16 2469; GFX7-NEXT: s_addc_u32 s5, s1, 0 2470; GFX7-NEXT: v_mov_b32_e32 v0, s4 2471; GFX7-NEXT: v_mov_b32_e32 
v2, s2 2472; GFX7-NEXT: v_mov_b32_e32 v1, s5 2473; GFX7-NEXT: v_mov_b32_e32 v3, s3 2474; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2475; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2476; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2477; GFX7-NEXT: buffer_wbinvl1_vol 2478; GFX7-NEXT: v_mov_b32_e32 v0, s0 2479; GFX7-NEXT: v_mov_b32_e32 v1, s1 2480; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2481; GFX7-NEXT: flat_store_dword v[0:1], v2 2482; GFX7-NEXT: s_endpgm 2483; 2484; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: 2485; GFX10-WGP: ; %bb.0: ; %entry 2486; GFX10-WGP-NEXT: s_clause 0x1 2487; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2488; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2489; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2490; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 2491; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 2492; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 2493; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2494; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 2495; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 2496; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2497; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2498; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2499; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2500; GFX10-WGP-NEXT: buffer_gl0_inv 2501; GFX10-WGP-NEXT: buffer_gl1_inv 2502; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2503; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2504; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2505; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2506; GFX10-WGP-NEXT: s_endpgm 2507; 2508; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: 2509; GFX10-CU: ; %bb.0: ; %entry 2510; GFX10-CU-NEXT: s_clause 0x1 2511; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2512; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2513; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2514; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 2515; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 2516; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 2517; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2518; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s5 2519; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 2520; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2521; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2522; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2523; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2524; GFX10-CU-NEXT: buffer_gl0_inv 2525; GFX10-CU-NEXT: buffer_gl1_inv 2526; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2527; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2528; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2529; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2530; GFX10-CU-NEXT: s_endpgm 2531; 2532; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: 2533; SKIP-CACHE-INV: ; %bb.0: ; %entry 2534; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 2535; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2536; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2537; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 2538; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 2539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2540; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 2541; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 2542; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 2543; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2544; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2545; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2546; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 2547; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 2548; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2549; SKIP-CACHE-INV-NEXT: s_endpgm 2550 i32* %out, i32 %in, i32 %old) { 2551entry: 2552 %gep = getelementptr i32, i32* %out, i32 4 2553 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst 2554 %val0 = extractvalue { i32, i1 } %val, 0 2555 store i32 %val0, i32* %out, align 4 2556 ret void 2557} 2558 2559define amdgpu_kernel void @flat_agent_one_as_unordered_load( 2560; GFX7-LABEL: flat_agent_one_as_unordered_load: 2561; GFX7: ; %bb.0: ; %entry 2562; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
2563; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2564; GFX7-NEXT: v_mov_b32_e32 v0, s0 2565; GFX7-NEXT: v_mov_b32_e32 v1, s1 2566; GFX7-NEXT: flat_load_dword v0, v[0:1] 2567; GFX7-NEXT: v_mov_b32_e32 v2, s2 2568; GFX7-NEXT: v_mov_b32_e32 v3, s3 2569; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2570; GFX7-NEXT: flat_store_dword v[2:3], v0 2571; GFX7-NEXT: s_endpgm 2572; 2573; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: 2574; GFX10-WGP: ; %bb.0: ; %entry 2575; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2576; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2577; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2578; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2579; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] 2580; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 2581; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 2582; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2583; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2584; GFX10-WGP-NEXT: s_endpgm 2585; 2586; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: 2587; GFX10-CU: ; %bb.0: ; %entry 2588; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2589; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2590; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2591; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2592; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] 2593; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 2594; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 2595; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2596; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2597; GFX10-CU-NEXT: s_endpgm 2598; 2599; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_load: 2600; SKIP-CACHE-INV: ; %bb.0: ; %entry 2601; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2602; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2603; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2604; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2605; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] 2606; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2607; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 2608; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2609; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[2:3], v0 2610; SKIP-CACHE-INV-NEXT: s_endpgm 2611 i32* %in, i32* %out) { 2612entry: 2613 %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 2614 store i32 %val, i32* %out 2615 ret void 2616} 2617 2618define amdgpu_kernel void @flat_agent_one_as_monotonic_load( 2619; GFX7-LABEL: flat_agent_one_as_monotonic_load: 2620; GFX7: ; %bb.0: ; %entry 2621; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2622; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2623; GFX7-NEXT: v_mov_b32_e32 v0, s0 2624; GFX7-NEXT: v_mov_b32_e32 v1, s1 2625; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 2626; GFX7-NEXT: v_mov_b32_e32 v2, s2 2627; GFX7-NEXT: v_mov_b32_e32 v3, s3 2628; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2629; GFX7-NEXT: flat_store_dword v[2:3], v0 2630; GFX7-NEXT: s_endpgm 2631; 2632; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: 2633; GFX10-WGP: ; %bb.0: ; %entry 2634; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2635; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2636; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2637; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2638; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 2639; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 2640; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 2641; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2642; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2643; GFX10-WGP-NEXT: s_endpgm 2644; 2645; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: 2646; GFX10-CU: ; %bb.0: ; %entry 2647; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2648; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2649; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2650; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2651; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 2652; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 2653; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 2654; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2655; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2656; GFX10-CU-NEXT: s_endpgm 2657; 2658; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_load: 2659; SKIP-CACHE-INV: ; 
%bb.0: ; %entry 2660; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2661; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2662; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2663; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2664; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 2665; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2666; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 2667; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2668; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 2669; SKIP-CACHE-INV-NEXT: s_endpgm 2670 i32* %in, i32* %out) { 2671entry: 2672 %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 2673 store i32 %val, i32* %out 2674 ret void 2675} 2676 2677define amdgpu_kernel void @flat_agent_one_as_acquire_load( 2678; GFX7-LABEL: flat_agent_one_as_acquire_load: 2679; GFX7: ; %bb.0: ; %entry 2680; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2681; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2682; GFX7-NEXT: v_mov_b32_e32 v0, s0 2683; GFX7-NEXT: v_mov_b32_e32 v1, s1 2684; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 2685; GFX7-NEXT: s_waitcnt vmcnt(0) 2686; GFX7-NEXT: buffer_wbinvl1_vol 2687; GFX7-NEXT: v_mov_b32_e32 v2, s2 2688; GFX7-NEXT: v_mov_b32_e32 v3, s3 2689; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2690; GFX7-NEXT: flat_store_dword v[2:3], v0 2691; GFX7-NEXT: s_endpgm 2692; 2693; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: 2694; GFX10-WGP: ; %bb.0: ; %entry 2695; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2696; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2697; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2698; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2699; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 2700; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2701; GFX10-WGP-NEXT: buffer_gl0_inv 2702; GFX10-WGP-NEXT: buffer_gl1_inv 2703; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 2704; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 2705; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2706; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2707; GFX10-WGP-NEXT: s_endpgm 2708; 2709; 
GFX10-CU-LABEL: flat_agent_one_as_acquire_load: 2710; GFX10-CU: ; %bb.0: ; %entry 2711; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2712; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2713; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2714; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2715; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 2716; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2717; GFX10-CU-NEXT: buffer_gl0_inv 2718; GFX10-CU-NEXT: buffer_gl1_inv 2719; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 2720; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 2721; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2722; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2723; GFX10-CU-NEXT: s_endpgm 2724; 2725; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_load: 2726; SKIP-CACHE-INV: ; %bb.0: ; %entry 2727; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2728; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2729; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2730; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2731; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc 2732; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2733; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2734; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 2735; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2736; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 2737; SKIP-CACHE-INV-NEXT: s_endpgm 2738 i32* %in, i32* %out) { 2739entry: 2740 %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 2741 store i32 %val, i32* %out 2742 ret void 2743} 2744 2745define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( 2746; GFX7-LABEL: flat_agent_one_as_seq_cst_load: 2747; GFX7: ; %bb.0: ; %entry 2748; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2749; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2750; GFX7-NEXT: v_mov_b32_e32 v0, s0 2751; GFX7-NEXT: v_mov_b32_e32 v1, s1 2752; GFX7-NEXT: s_waitcnt vmcnt(0) 2753; GFX7-NEXT: flat_load_dword v0, v[0:1] glc 2754; GFX7-NEXT: s_waitcnt vmcnt(0) 2755; GFX7-NEXT: buffer_wbinvl1_vol 2756; GFX7-NEXT: v_mov_b32_e32 v2, s2 2757; GFX7-NEXT: v_mov_b32_e32 
v3, s3 2758; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2759; GFX7-NEXT: flat_store_dword v[2:3], v0 2760; GFX7-NEXT: s_endpgm 2761; 2762; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: 2763; GFX10-WGP: ; %bb.0: ; %entry 2764; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2765; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2766; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2767; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2768; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2769; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2770; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc 2771; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2772; GFX10-WGP-NEXT: buffer_gl0_inv 2773; GFX10-WGP-NEXT: buffer_gl1_inv 2774; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 2775; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 2776; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2777; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2778; GFX10-WGP-NEXT: s_endpgm 2779; 2780; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: 2781; GFX10-CU: ; %bb.0: ; %entry 2782; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2783; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2784; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2785; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2786; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2787; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2788; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 2789; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2790; GFX10-CU-NEXT: buffer_gl0_inv 2791; GFX10-CU-NEXT: buffer_gl1_inv 2792; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 2793; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 2794; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2795; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2796; GFX10-CU-NEXT: s_endpgm 2797; 2798; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_load: 2799; SKIP-CACHE-INV: ; %bb.0: ; %entry 2800; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2801; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2802; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2803; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2804; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2805; SKIP-CACHE-INV-NEXT: 
flat_load_dword v0, v[0:1] glc 2806; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2807; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2808; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 2809; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2810; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 2811; SKIP-CACHE-INV-NEXT: s_endpgm 2812 i32* %in, i32* %out) { 2813entry: 2814 %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 2815 store i32 %val, i32* %out 2816 ret void 2817} 2818 2819define amdgpu_kernel void @flat_agent_one_as_unordered_store( 2820; GFX7-LABEL: flat_agent_one_as_unordered_store: 2821; GFX7: ; %bb.0: ; %entry 2822; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 2823; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 2824; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2825; GFX7-NEXT: v_mov_b32_e32 v2, s2 2826; GFX7-NEXT: v_mov_b32_e32 v0, s0 2827; GFX7-NEXT: v_mov_b32_e32 v1, s1 2828; GFX7-NEXT: flat_store_dword v[0:1], v2 2829; GFX7-NEXT: s_endpgm 2830; 2831; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: 2832; GFX10-WGP: ; %bb.0: ; %entry 2833; GFX10-WGP-NEXT: s_clause 0x1 2834; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2835; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 2836; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2838; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2839; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2840; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2841; GFX10-WGP-NEXT: s_endpgm 2842; 2843; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: 2844; GFX10-CU: ; %bb.0: ; %entry 2845; GFX10-CU-NEXT: s_clause 0x1 2846; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2847; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 2848; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2849; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2850; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2851; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2852; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2853; GFX10-CU-NEXT: s_endpgm 2854; 2855; SKIP-CACHE-INV-LABEL: 
flat_agent_one_as_unordered_store: 2856; SKIP-CACHE-INV: ; %bb.0: ; %entry 2857; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 2858; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2859; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2860; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2861; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2862; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2863; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2864; SKIP-CACHE-INV-NEXT: s_endpgm 2865 i32 %in, i32* %out) { 2866entry: 2867 store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 2868 ret void 2869} 2870 2871define amdgpu_kernel void @flat_agent_one_as_monotonic_store( 2872; GFX7-LABEL: flat_agent_one_as_monotonic_store: 2873; GFX7: ; %bb.0: ; %entry 2874; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 2875; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 2876; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2877; GFX7-NEXT: v_mov_b32_e32 v2, s2 2878; GFX7-NEXT: v_mov_b32_e32 v0, s0 2879; GFX7-NEXT: v_mov_b32_e32 v1, s1 2880; GFX7-NEXT: flat_store_dword v[0:1], v2 2881; GFX7-NEXT: s_endpgm 2882; 2883; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: 2884; GFX10-WGP: ; %bb.0: ; %entry 2885; GFX10-WGP-NEXT: s_clause 0x1 2886; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2887; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 2888; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2889; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2890; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2891; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2892; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2893; GFX10-WGP-NEXT: s_endpgm 2894; 2895; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: 2896; GFX10-CU: ; %bb.0: ; %entry 2897; GFX10-CU-NEXT: s_clause 0x1 2898; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2899; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 2900; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2901; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2902; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2903; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2904; 
GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2905; GFX10-CU-NEXT: s_endpgm 2906; 2907; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store: 2908; SKIP-CACHE-INV: ; %bb.0: ; %entry 2909; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 2910; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2911; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2912; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2913; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2914; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2915; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2916; SKIP-CACHE-INV-NEXT: s_endpgm 2917 i32 %in, i32* %out) { 2918entry: 2919 store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 2920 ret void 2921} 2922 2923define amdgpu_kernel void @flat_agent_one_as_release_store( 2924; GFX7-LABEL: flat_agent_one_as_release_store: 2925; GFX7: ; %bb.0: ; %entry 2926; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 2927; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 2928; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2929; GFX7-NEXT: v_mov_b32_e32 v2, s2 2930; GFX7-NEXT: v_mov_b32_e32 v0, s0 2931; GFX7-NEXT: v_mov_b32_e32 v1, s1 2932; GFX7-NEXT: s_waitcnt vmcnt(0) 2933; GFX7-NEXT: flat_store_dword v[0:1], v2 2934; GFX7-NEXT: s_endpgm 2935; 2936; GFX10-WGP-LABEL: flat_agent_one_as_release_store: 2937; GFX10-WGP: ; %bb.0: ; %entry 2938; GFX10-WGP-NEXT: s_clause 0x1 2939; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2940; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 2941; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2942; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2943; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2944; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 2945; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2946; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2947; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 2948; GFX10-WGP-NEXT: s_endpgm 2949; 2950; GFX10-CU-LABEL: flat_agent_one_as_release_store: 2951; GFX10-CU: ; %bb.0: ; %entry 2952; GFX10-CU-NEXT: s_clause 0x1 2953; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 
2954; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 2955; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2956; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2957; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2958; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 2959; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2960; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 2961; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 2962; GFX10-CU-NEXT: s_endpgm 2963; 2964; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store: 2965; SKIP-CACHE-INV: ; %bb.0: ; %entry 2966; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 2967; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2968; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2969; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 2970; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2971; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2972; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2973; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 2974; SKIP-CACHE-INV-NEXT: s_endpgm 2975 i32 %in, i32* %out) { 2976entry: 2977 store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 2978 ret void 2979} 2980 2981define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( 2982; GFX7-LABEL: flat_agent_one_as_seq_cst_store: 2983; GFX7: ; %bb.0: ; %entry 2984; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 2985; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 2986; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2987; GFX7-NEXT: v_mov_b32_e32 v2, s2 2988; GFX7-NEXT: v_mov_b32_e32 v0, s0 2989; GFX7-NEXT: v_mov_b32_e32 v1, s1 2990; GFX7-NEXT: s_waitcnt vmcnt(0) 2991; GFX7-NEXT: flat_store_dword v[0:1], v2 2992; GFX7-NEXT: s_endpgm 2993; 2994; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: 2995; GFX10-WGP: ; %bb.0: ; %entry 2996; GFX10-WGP-NEXT: s_clause 0x1 2997; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2998; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 2999; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3000; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3001; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3002; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 
3003; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3004; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3005; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3006; GFX10-WGP-NEXT: s_endpgm 3007; 3008; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: 3009; GFX10-CU: ; %bb.0: ; %entry 3010; GFX10-CU-NEXT: s_clause 0x1 3011; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3012; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 3013; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3014; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3015; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3016; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3017; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3018; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3019; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3020; GFX10-CU-NEXT: s_endpgm 3021; 3022; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store: 3023; SKIP-CACHE-INV: ; %bb.0: ; %entry 3024; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 3025; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3026; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3027; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 3028; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3029; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3030; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3031; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3032; SKIP-CACHE-INV-NEXT: s_endpgm 3033 i32 %in, i32* %out) { 3034entry: 3035 store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 3036 ret void 3037} 3038 3039define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( 3040; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: 3041; GFX7: ; %bb.0: ; %entry 3042; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3043; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3044; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3045; GFX7-NEXT: v_mov_b32_e32 v0, s0 3046; GFX7-NEXT: v_mov_b32_e32 v1, s1 3047; GFX7-NEXT: v_mov_b32_e32 v2, s2 3048; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3049; GFX7-NEXT: s_endpgm 3050; 3051; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: 3052; GFX10-WGP: 
; %bb.0: ; %entry 3053; GFX10-WGP-NEXT: s_clause 0x1 3054; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3055; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3056; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3057; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3058; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3059; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3060; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3061; GFX10-WGP-NEXT: s_endpgm 3062; 3063; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: 3064; GFX10-CU: ; %bb.0: ; %entry 3065; GFX10-CU-NEXT: s_clause 0x1 3066; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3067; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3068; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3069; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3070; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3071; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3072; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3073; GFX10-CU-NEXT: s_endpgm 3074; 3075; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_atomicrmw: 3076; SKIP-CACHE-INV: ; %bb.0: ; %entry 3077; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3078; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3079; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3080; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3081; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3082; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3083; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3084; SKIP-CACHE-INV-NEXT: s_endpgm 3085 i32* %out, i32 %in) { 3086entry: 3087 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic 3088 ret void 3089} 3090 3091define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( 3092; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: 3093; GFX7: ; %bb.0: ; %entry 3094; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3095; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3096; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3097; GFX7-NEXT: v_mov_b32_e32 v0, s0 3098; GFX7-NEXT: v_mov_b32_e32 v1, s1 3099; GFX7-NEXT: v_mov_b32_e32 v2, s2 3100; GFX7-NEXT: 
flat_atomic_swap v[0:1], v2 3101; GFX7-NEXT: s_waitcnt vmcnt(0) 3102; GFX7-NEXT: buffer_wbinvl1_vol 3103; GFX7-NEXT: s_endpgm 3104; 3105; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: 3106; GFX10-WGP: ; %bb.0: ; %entry 3107; GFX10-WGP-NEXT: s_clause 0x1 3108; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3109; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3110; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3111; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3112; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3113; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3114; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3115; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3116; GFX10-WGP-NEXT: buffer_gl0_inv 3117; GFX10-WGP-NEXT: buffer_gl1_inv 3118; GFX10-WGP-NEXT: s_endpgm 3119; 3120; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: 3121; GFX10-CU: ; %bb.0: ; %entry 3122; GFX10-CU-NEXT: s_clause 0x1 3123; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3124; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3125; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3126; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3127; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3128; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3129; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3130; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3131; GFX10-CU-NEXT: buffer_gl0_inv 3132; GFX10-CU-NEXT: buffer_gl1_inv 3133; GFX10-CU-NEXT: s_endpgm 3134; 3135; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw: 3136; SKIP-CACHE-INV: ; %bb.0: ; %entry 3137; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3138; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3139; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3140; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3141; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3142; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3143; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3144; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3145; SKIP-CACHE-INV-NEXT: s_endpgm 3146 i32* %out, i32 %in) { 3147entry: 3148 %val = atomicrmw volatile xchg i32* %out, i32 
%in syncscope("agent-one-as") acquire 3149 ret void 3150} 3151 3152define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( 3153; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: 3154; GFX7: ; %bb.0: ; %entry 3155; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3156; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3157; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3158; GFX7-NEXT: v_mov_b32_e32 v0, s0 3159; GFX7-NEXT: v_mov_b32_e32 v1, s1 3160; GFX7-NEXT: v_mov_b32_e32 v2, s2 3161; GFX7-NEXT: s_waitcnt vmcnt(0) 3162; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3163; GFX7-NEXT: s_endpgm 3164; 3165; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: 3166; GFX10-WGP: ; %bb.0: ; %entry 3167; GFX10-WGP-NEXT: s_clause 0x1 3168; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3169; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3170; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3171; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3172; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3173; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3174; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3175; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3176; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3177; GFX10-WGP-NEXT: s_endpgm 3178; 3179; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: 3180; GFX10-CU: ; %bb.0: ; %entry 3181; GFX10-CU-NEXT: s_clause 0x1 3182; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3183; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3184; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3185; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3186; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3187; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3188; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3189; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3190; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3191; GFX10-CU-NEXT: s_endpgm 3192; 3193; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_atomicrmw: 3194; SKIP-CACHE-INV: ; %bb.0: ; %entry 3195; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3196; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3197; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) 3198; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3199; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3200; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3201; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3202; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3203; SKIP-CACHE-INV-NEXT: s_endpgm 3204 i32* %out, i32 %in) { 3205entry: 3206 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release 3207 ret void 3208} 3209 3210define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( 3211; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: 3212; GFX7: ; %bb.0: ; %entry 3213; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3214; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3215; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3216; GFX7-NEXT: v_mov_b32_e32 v0, s0 3217; GFX7-NEXT: v_mov_b32_e32 v1, s1 3218; GFX7-NEXT: v_mov_b32_e32 v2, s2 3219; GFX7-NEXT: s_waitcnt vmcnt(0) 3220; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3221; GFX7-NEXT: s_waitcnt vmcnt(0) 3222; GFX7-NEXT: buffer_wbinvl1_vol 3223; GFX7-NEXT: s_endpgm 3224; 3225; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: 3226; GFX10-WGP: ; %bb.0: ; %entry 3227; GFX10-WGP-NEXT: s_clause 0x1 3228; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3229; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3230; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3231; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3232; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3233; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3234; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3235; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3236; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3237; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3238; GFX10-WGP-NEXT: buffer_gl0_inv 3239; GFX10-WGP-NEXT: buffer_gl1_inv 3240; GFX10-WGP-NEXT: s_endpgm 3241; 3242; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: 3243; GFX10-CU: ; %bb.0: ; %entry 3244; GFX10-CU-NEXT: s_clause 0x1 3245; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3246; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3247; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) 3248; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3249; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3250; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3251; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3252; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3253; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3254; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3255; GFX10-CU-NEXT: buffer_gl0_inv 3256; GFX10-CU-NEXT: buffer_gl1_inv 3257; GFX10-CU-NEXT: s_endpgm 3258; 3259; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw: 3260; SKIP-CACHE-INV: ; %bb.0: ; %entry 3261; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3262; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3263; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3264; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3265; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3266; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3267; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3268; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3269; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3270; SKIP-CACHE-INV-NEXT: s_endpgm 3271 i32* %out, i32 %in) { 3272entry: 3273 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel 3274 ret void 3275} 3276 3277define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( 3278; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: 3279; GFX7: ; %bb.0: ; %entry 3280; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3281; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3282; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3283; GFX7-NEXT: v_mov_b32_e32 v0, s0 3284; GFX7-NEXT: v_mov_b32_e32 v1, s1 3285; GFX7-NEXT: v_mov_b32_e32 v2, s2 3286; GFX7-NEXT: s_waitcnt vmcnt(0) 3287; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3288; GFX7-NEXT: s_waitcnt vmcnt(0) 3289; GFX7-NEXT: buffer_wbinvl1_vol 3290; GFX7-NEXT: s_endpgm 3291; 3292; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: 3293; GFX10-WGP: ; %bb.0: ; %entry 3294; GFX10-WGP-NEXT: s_clause 0x1 3295; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3296; GFX10-WGP-NEXT: s_load_dword 
s2, s[4:5], 0x8 3297; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3298; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3299; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3300; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3301; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3302; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3303; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 3304; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3305; GFX10-WGP-NEXT: buffer_gl0_inv 3306; GFX10-WGP-NEXT: buffer_gl1_inv 3307; GFX10-WGP-NEXT: s_endpgm 3308; 3309; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: 3310; GFX10-CU: ; %bb.0: ; %entry 3311; GFX10-CU-NEXT: s_clause 0x1 3312; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3313; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3314; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3315; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3316; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3317; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3318; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3319; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3320; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 3321; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3322; GFX10-CU-NEXT: buffer_gl0_inv 3323; GFX10-CU-NEXT: buffer_gl1_inv 3324; GFX10-CU-NEXT: s_endpgm 3325; 3326; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw: 3327; SKIP-CACHE-INV: ; %bb.0: ; %entry 3328; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3329; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3330; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3331; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3332; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3333; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3334; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3335; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 3336; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3337; SKIP-CACHE-INV-NEXT: s_endpgm 3338 i32* %out, i32 %in) { 3339entry: 3340 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst 3341 ret void 3342} 3343 3344define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( 
3345; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: 3346; GFX7: ; %bb.0: ; %entry 3347; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3348; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3349; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3350; GFX7-NEXT: v_mov_b32_e32 v0, s0 3351; GFX7-NEXT: v_mov_b32_e32 v1, s1 3352; GFX7-NEXT: v_mov_b32_e32 v2, s2 3353; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3354; GFX7-NEXT: s_waitcnt vmcnt(0) 3355; GFX7-NEXT: buffer_wbinvl1_vol 3356; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3357; GFX7-NEXT: flat_store_dword v[0:1], v2 3358; GFX7-NEXT: s_endpgm 3359; 3360; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: 3361; GFX10-WGP: ; %bb.0: ; %entry 3362; GFX10-WGP-NEXT: s_clause 0x1 3363; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3364; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3365; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3366; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3367; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3368; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3369; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3370; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3371; GFX10-WGP-NEXT: buffer_gl0_inv 3372; GFX10-WGP-NEXT: buffer_gl1_inv 3373; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3374; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3375; GFX10-WGP-NEXT: s_endpgm 3376; 3377; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: 3378; GFX10-CU: ; %bb.0: ; %entry 3379; GFX10-CU-NEXT: s_clause 0x1 3380; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3381; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3382; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3383; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3384; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3385; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3386; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3387; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3388; GFX10-CU-NEXT: buffer_gl0_inv 3389; GFX10-CU-NEXT: buffer_gl1_inv 3390; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3391; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3392; GFX10-CU-NEXT: s_endpgm 3393; 
3394; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: 3395; SKIP-CACHE-INV: ; %bb.0: ; %entry 3396; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3397; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3398; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3399; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3400; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3401; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3402; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3403; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3404; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3405; SKIP-CACHE-INV-NEXT: s_endpgm 3406 i32* %out, i32 %in) { 3407entry: 3408 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire 3409 store i32 %val, i32* %out, align 4 3410 ret void 3411} 3412 3413define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( 3414; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: 3415; GFX7: ; %bb.0: ; %entry 3416; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3417; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3418; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3419; GFX7-NEXT: v_mov_b32_e32 v0, s0 3420; GFX7-NEXT: v_mov_b32_e32 v1, s1 3421; GFX7-NEXT: v_mov_b32_e32 v2, s2 3422; GFX7-NEXT: s_waitcnt vmcnt(0) 3423; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3424; GFX7-NEXT: s_waitcnt vmcnt(0) 3425; GFX7-NEXT: buffer_wbinvl1_vol 3426; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3427; GFX7-NEXT: flat_store_dword v[0:1], v2 3428; GFX7-NEXT: s_endpgm 3429; 3430; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: 3431; GFX10-WGP: ; %bb.0: ; %entry 3432; GFX10-WGP-NEXT: s_clause 0x1 3433; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3434; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3435; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3436; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3437; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3438; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3439; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3440; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 3441; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3442; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3443; GFX10-WGP-NEXT: buffer_gl0_inv 3444; GFX10-WGP-NEXT: buffer_gl1_inv 3445; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3446; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3447; GFX10-WGP-NEXT: s_endpgm 3448; 3449; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: 3450; GFX10-CU: ; %bb.0: ; %entry 3451; GFX10-CU-NEXT: s_clause 0x1 3452; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3453; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3454; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3455; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3456; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3457; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3458; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3459; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3460; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3461; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3462; GFX10-CU-NEXT: buffer_gl0_inv 3463; GFX10-CU-NEXT: buffer_gl1_inv 3464; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3465; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3466; GFX10-CU-NEXT: s_endpgm 3467; 3468; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: 3469; SKIP-CACHE-INV: ; %bb.0: ; %entry 3470; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3471; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3472; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3473; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3474; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3475; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3476; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3477; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3478; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3479; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3480; SKIP-CACHE-INV-NEXT: s_endpgm 3481 i32* %out, i32 %in) { 3482entry: 3483 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel 3484 store i32 %val, i32* %out, align 4 3485 ret void 3486} 3487 3488define 
amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( 3489; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: 3490; GFX7: ; %bb.0: ; %entry 3491; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3492; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3493; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3494; GFX7-NEXT: v_mov_b32_e32 v0, s0 3495; GFX7-NEXT: v_mov_b32_e32 v1, s1 3496; GFX7-NEXT: v_mov_b32_e32 v2, s2 3497; GFX7-NEXT: s_waitcnt vmcnt(0) 3498; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3499; GFX7-NEXT: s_waitcnt vmcnt(0) 3500; GFX7-NEXT: buffer_wbinvl1_vol 3501; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3502; GFX7-NEXT: flat_store_dword v[0:1], v2 3503; GFX7-NEXT: s_endpgm 3504; 3505; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: 3506; GFX10-WGP: ; %bb.0: ; %entry 3507; GFX10-WGP-NEXT: s_clause 0x1 3508; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3509; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3510; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3511; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3512; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3513; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3514; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3515; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3516; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3517; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3518; GFX10-WGP-NEXT: buffer_gl0_inv 3519; GFX10-WGP-NEXT: buffer_gl1_inv 3520; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3521; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 3522; GFX10-WGP-NEXT: s_endpgm 3523; 3524; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: 3525; GFX10-CU: ; %bb.0: ; %entry 3526; GFX10-CU-NEXT: s_clause 0x1 3527; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3528; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3529; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3530; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3531; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3532; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3533; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3534; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3535; GFX10-CU-NEXT: 
flat_atomic_swap v2, v[0:1], v2 glc 3536; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3537; GFX10-CU-NEXT: buffer_gl0_inv 3538; GFX10-CU-NEXT: buffer_gl1_inv 3539; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3540; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 3541; GFX10-CU-NEXT: s_endpgm 3542; 3543; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: 3544; SKIP-CACHE-INV: ; %bb.0: ; %entry 3545; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3546; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3547; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3548; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3549; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3550; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3551; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3552; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3553; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3554; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 3555; SKIP-CACHE-INV-NEXT: s_endpgm 3556 i32* %out, i32 %in) { 3557entry: 3558 %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst 3559 store i32 %val, i32* %out, align 4 3560 ret void 3561} 3562 3563define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( 3564; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: 3565; GFX7: ; %bb.0: ; %entry 3566; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3567; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3568; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3569; GFX7-NEXT: s_add_u32 s0, s0, 16 3570; GFX7-NEXT: s_addc_u32 s1, s1, 0 3571; GFX7-NEXT: v_mov_b32_e32 v0, s0 3572; GFX7-NEXT: v_mov_b32_e32 v2, s2 3573; GFX7-NEXT: v_mov_b32_e32 v1, s1 3574; GFX7-NEXT: v_mov_b32_e32 v3, s3 3575; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3576; GFX7-NEXT: s_endpgm 3577; 3578; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: 3579; GFX10-WGP: ; %bb.0: ; %entry 3580; GFX10-WGP-NEXT: s_clause 0x1 3581; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3582; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 3583; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3584; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3585; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3586; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3587; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3588; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3589; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3590; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3591; GFX10-WGP-NEXT: s_endpgm 3592; 3593; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: 3594; GFX10-CU: ; %bb.0: ; %entry 3595; GFX10-CU-NEXT: s_clause 0x1 3596; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3597; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3598; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3599; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3600; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3601; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3602; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3603; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3604; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3605; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3606; GFX10-CU-NEXT: s_endpgm 3607; 3608; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: 3609; SKIP-CACHE-INV: ; %bb.0: ; %entry 3610; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3611; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3612; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3613; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3614; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3615; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3616; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3617; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3618; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3619; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3620; SKIP-CACHE-INV-NEXT: s_endpgm 3621 i32* %out, i32 %in, i32 %old) { 3622entry: 3623 %gep = getelementptr i32, i32* %out, i32 4 3624 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic 3625 ret void 3626} 3627 3628define amdgpu_kernel void 
@flat_agent_one_as_acquire_monotonic_cmpxchg( 3629; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: 3630; GFX7: ; %bb.0: ; %entry 3631; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3632; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3633; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3634; GFX7-NEXT: s_add_u32 s0, s0, 16 3635; GFX7-NEXT: s_addc_u32 s1, s1, 0 3636; GFX7-NEXT: v_mov_b32_e32 v0, s0 3637; GFX7-NEXT: v_mov_b32_e32 v2, s2 3638; GFX7-NEXT: v_mov_b32_e32 v1, s1 3639; GFX7-NEXT: v_mov_b32_e32 v3, s3 3640; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3641; GFX7-NEXT: s_waitcnt vmcnt(0) 3642; GFX7-NEXT: buffer_wbinvl1_vol 3643; GFX7-NEXT: s_endpgm 3644; 3645; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: 3646; GFX10-WGP: ; %bb.0: ; %entry 3647; GFX10-WGP-NEXT: s_clause 0x1 3648; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3649; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3650; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3651; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3652; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3653; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3654; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3655; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3656; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3657; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3658; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3659; GFX10-WGP-NEXT: buffer_gl0_inv 3660; GFX10-WGP-NEXT: buffer_gl1_inv 3661; GFX10-WGP-NEXT: s_endpgm 3662; 3663; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: 3664; GFX10-CU: ; %bb.0: ; %entry 3665; GFX10-CU-NEXT: s_clause 0x1 3666; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3667; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3668; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3669; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3670; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3671; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3672; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3673; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3674; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3675; GFX10-CU-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] 3676; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3677; GFX10-CU-NEXT: buffer_gl0_inv 3678; GFX10-CU-NEXT: buffer_gl1_inv 3679; GFX10-CU-NEXT: s_endpgm 3680; 3681; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: 3682; SKIP-CACHE-INV: ; %bb.0: ; %entry 3683; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3684; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3685; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3686; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3687; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3688; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3689; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3690; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3691; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3692; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3693; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3694; SKIP-CACHE-INV-NEXT: s_endpgm 3695 i32* %out, i32 %in, i32 %old) { 3696entry: 3697 %gep = getelementptr i32, i32* %out, i32 4 3698 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic 3699 ret void 3700} 3701 3702define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( 3703; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: 3704; GFX7: ; %bb.0: ; %entry 3705; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3706; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3707; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3708; GFX7-NEXT: s_add_u32 s0, s0, 16 3709; GFX7-NEXT: s_addc_u32 s1, s1, 0 3710; GFX7-NEXT: v_mov_b32_e32 v0, s0 3711; GFX7-NEXT: v_mov_b32_e32 v2, s2 3712; GFX7-NEXT: v_mov_b32_e32 v1, s1 3713; GFX7-NEXT: v_mov_b32_e32 v3, s3 3714; GFX7-NEXT: s_waitcnt vmcnt(0) 3715; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3716; GFX7-NEXT: s_endpgm 3717; 3718; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: 3719; GFX10-WGP: ; %bb.0: ; %entry 3720; GFX10-WGP-NEXT: s_clause 0x1 3721; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3722; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3723; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3724; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3725; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3726; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3727; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3728; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3729; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3730; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3731; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3732; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3733; GFX10-WGP-NEXT: s_endpgm 3734; 3735; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: 3736; GFX10-CU: ; %bb.0: ; %entry 3737; GFX10-CU-NEXT: s_clause 0x1 3738; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3739; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3740; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3741; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3742; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3743; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3744; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3745; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3746; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3747; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3748; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3749; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3750; GFX10-CU-NEXT: s_endpgm 3751; 3752; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: 3753; SKIP-CACHE-INV: ; %bb.0: ; %entry 3754; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3755; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3756; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3757; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3758; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3759; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3760; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3761; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3762; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3763; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3764; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3765; SKIP-CACHE-INV-NEXT: s_endpgm 3766 i32* %out, i32 %in, 
i32 %old) { 3767entry: 3768 %gep = getelementptr i32, i32* %out, i32 4 3769 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic 3770 ret void 3771} 3772 3773define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( 3774; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: 3775; GFX7: ; %bb.0: ; %entry 3776; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3777; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3778; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3779; GFX7-NEXT: s_add_u32 s0, s0, 16 3780; GFX7-NEXT: s_addc_u32 s1, s1, 0 3781; GFX7-NEXT: v_mov_b32_e32 v0, s0 3782; GFX7-NEXT: v_mov_b32_e32 v2, s2 3783; GFX7-NEXT: v_mov_b32_e32 v1, s1 3784; GFX7-NEXT: v_mov_b32_e32 v3, s3 3785; GFX7-NEXT: s_waitcnt vmcnt(0) 3786; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3787; GFX7-NEXT: s_waitcnt vmcnt(0) 3788; GFX7-NEXT: buffer_wbinvl1_vol 3789; GFX7-NEXT: s_endpgm 3790; 3791; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: 3792; GFX10-WGP: ; %bb.0: ; %entry 3793; GFX10-WGP-NEXT: s_clause 0x1 3794; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3795; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3796; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3797; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3798; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3799; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3800; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3801; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3802; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3803; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3804; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3805; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3806; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3807; GFX10-WGP-NEXT: buffer_gl0_inv 3808; GFX10-WGP-NEXT: buffer_gl1_inv 3809; GFX10-WGP-NEXT: s_endpgm 3810; 3811; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: 3812; GFX10-CU: ; %bb.0: ; %entry 3813; GFX10-CU-NEXT: s_clause 0x1 3814; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3815; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 3816; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3817; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3818; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3819; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3820; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3821; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3822; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3823; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3824; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3825; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3826; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3827; GFX10-CU-NEXT: buffer_gl0_inv 3828; GFX10-CU-NEXT: buffer_gl1_inv 3829; GFX10-CU-NEXT: s_endpgm 3830; 3831; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: 3832; SKIP-CACHE-INV: ; %bb.0: ; %entry 3833; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3834; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3835; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3836; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3837; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3838; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3839; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3840; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3841; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3842; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3843; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3844; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3845; SKIP-CACHE-INV-NEXT: s_endpgm 3846 i32* %out, i32 %in, i32 %old) { 3847entry: 3848 %gep = getelementptr i32, i32* %out, i32 4 3849 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic 3850 ret void 3851} 3852 3853define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( 3854; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: 3855; GFX7: ; %bb.0: ; %entry 3856; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3857; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3858; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3859; GFX7-NEXT: s_add_u32 s0, s0, 16 3860; GFX7-NEXT: s_addc_u32 s1, s1, 0 
3861; GFX7-NEXT: v_mov_b32_e32 v0, s0 3862; GFX7-NEXT: v_mov_b32_e32 v2, s2 3863; GFX7-NEXT: v_mov_b32_e32 v1, s1 3864; GFX7-NEXT: v_mov_b32_e32 v3, s3 3865; GFX7-NEXT: s_waitcnt vmcnt(0) 3866; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3867; GFX7-NEXT: s_waitcnt vmcnt(0) 3868; GFX7-NEXT: buffer_wbinvl1_vol 3869; GFX7-NEXT: s_endpgm 3870; 3871; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: 3872; GFX10-WGP: ; %bb.0: ; %entry 3873; GFX10-WGP-NEXT: s_clause 0x1 3874; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3875; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3876; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3877; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3878; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3879; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3880; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3881; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3882; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3883; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3884; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3885; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3886; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3887; GFX10-WGP-NEXT: buffer_gl0_inv 3888; GFX10-WGP-NEXT: buffer_gl1_inv 3889; GFX10-WGP-NEXT: s_endpgm 3890; 3891; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: 3892; GFX10-CU: ; %bb.0: ; %entry 3893; GFX10-CU-NEXT: s_clause 0x1 3894; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3895; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3896; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3897; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3898; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3899; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3900; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3901; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3902; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3903; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3904; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3905; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3906; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3907; GFX10-CU-NEXT: buffer_gl0_inv 3908; GFX10-CU-NEXT: 
buffer_gl1_inv 3909; GFX10-CU-NEXT: s_endpgm 3910; 3911; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: 3912; SKIP-CACHE-INV: ; %bb.0: ; %entry 3913; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3914; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3915; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3916; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3917; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3918; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3919; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3920; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3921; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3922; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3923; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3924; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3925; SKIP-CACHE-INV-NEXT: s_endpgm 3926 i32* %out, i32 %in, i32 %old) { 3927entry: 3928 %gep = getelementptr i32, i32* %out, i32 4 3929 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic 3930 ret void 3931} 3932 3933define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( 3934; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: 3935; GFX7: ; %bb.0: ; %entry 3936; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3937; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3938; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3939; GFX7-NEXT: s_add_u32 s0, s0, 16 3940; GFX7-NEXT: s_addc_u32 s1, s1, 0 3941; GFX7-NEXT: v_mov_b32_e32 v0, s0 3942; GFX7-NEXT: v_mov_b32_e32 v2, s2 3943; GFX7-NEXT: v_mov_b32_e32 v1, s1 3944; GFX7-NEXT: v_mov_b32_e32 v3, s3 3945; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3946; GFX7-NEXT: s_waitcnt vmcnt(0) 3947; GFX7-NEXT: buffer_wbinvl1_vol 3948; GFX7-NEXT: s_endpgm 3949; 3950; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: 3951; GFX10-WGP: ; %bb.0: ; %entry 3952; GFX10-WGP-NEXT: s_clause 0x1 3953; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3954; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3955; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) 3956; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 3957; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 3958; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3959; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 3960; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3961; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 3962; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3963; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3964; GFX10-WGP-NEXT: buffer_gl0_inv 3965; GFX10-WGP-NEXT: buffer_gl1_inv 3966; GFX10-WGP-NEXT: s_endpgm 3967; 3968; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: 3969; GFX10-CU: ; %bb.0: ; %entry 3970; GFX10-CU-NEXT: s_clause 0x1 3971; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3972; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 3973; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3974; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 3975; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 3976; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3977; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 3978; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3979; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 3980; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3981; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 3982; GFX10-CU-NEXT: buffer_gl0_inv 3983; GFX10-CU-NEXT: buffer_gl1_inv 3984; GFX10-CU-NEXT: s_endpgm 3985; 3986; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: 3987; SKIP-CACHE-INV: ; %bb.0: ; %entry 3988; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 3989; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3990; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3991; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 3992; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 3993; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 3994; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 3995; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 3996; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 3997; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3998; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3999; SKIP-CACHE-INV-NEXT: s_endpgm 4000 i32* %out, i32 %in, i32 %old) { 
4001entry: 4002 %gep = getelementptr i32, i32* %out, i32 4 4003 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire 4004 ret void 4005} 4006 4007define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( 4008; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: 4009; GFX7: ; %bb.0: ; %entry 4010; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4011; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4012; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4013; GFX7-NEXT: s_add_u32 s0, s0, 16 4014; GFX7-NEXT: s_addc_u32 s1, s1, 0 4015; GFX7-NEXT: v_mov_b32_e32 v0, s0 4016; GFX7-NEXT: v_mov_b32_e32 v2, s2 4017; GFX7-NEXT: v_mov_b32_e32 v1, s1 4018; GFX7-NEXT: v_mov_b32_e32 v3, s3 4019; GFX7-NEXT: s_waitcnt vmcnt(0) 4020; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4021; GFX7-NEXT: s_waitcnt vmcnt(0) 4022; GFX7-NEXT: buffer_wbinvl1_vol 4023; GFX7-NEXT: s_endpgm 4024; 4025; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: 4026; GFX10-WGP: ; %bb.0: ; %entry 4027; GFX10-WGP-NEXT: s_clause 0x1 4028; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4029; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4030; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4031; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4032; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4033; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4034; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4035; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4036; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4037; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4038; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4039; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4040; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4041; GFX10-WGP-NEXT: buffer_gl0_inv 4042; GFX10-WGP-NEXT: buffer_gl1_inv 4043; GFX10-WGP-NEXT: s_endpgm 4044; 4045; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: 4046; GFX10-CU: ; %bb.0: ; %entry 4047; GFX10-CU-NEXT: s_clause 0x1 4048; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4049; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 4050; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4051; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4052; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4053; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4054; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4055; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4056; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4057; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4058; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4059; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4060; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4061; GFX10-CU-NEXT: buffer_gl0_inv 4062; GFX10-CU-NEXT: buffer_gl1_inv 4063; GFX10-CU-NEXT: s_endpgm 4064; 4065; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg: 4066; SKIP-CACHE-INV: ; %bb.0: ; %entry 4067; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4068; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4069; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4070; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4071; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4072; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4073; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4074; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4075; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4076; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4077; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4078; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4079; SKIP-CACHE-INV-NEXT: s_endpgm 4080 i32* %out, i32 %in, i32 %old) { 4081entry: 4082 %gep = getelementptr i32, i32* %out, i32 4 4083 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire 4084 ret void 4085} 4086
; Non-returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering acq_rel, failure ordering acquire.
; Expected code waits before the atomic, then waits and invalidates after it:
; buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010, and
; no invalidate in the -amdgcn-skip-cache-invalidations run.
4087define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( 4088; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: 4089; GFX7: ; %bb.0: ; %entry 4090; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4091; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4092; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4093; GFX7-NEXT: s_add_u32 s0, s0, 16 4094; GFX7-NEXT: s_addc_u32 s1, s1, 0 4095; GFX7-NEXT: v_mov_b32_e32 
v0, s0 4096; GFX7-NEXT: v_mov_b32_e32 v2, s2 4097; GFX7-NEXT: v_mov_b32_e32 v1, s1 4098; GFX7-NEXT: v_mov_b32_e32 v3, s3 4099; GFX7-NEXT: s_waitcnt vmcnt(0) 4100; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4101; GFX7-NEXT: s_waitcnt vmcnt(0) 4102; GFX7-NEXT: buffer_wbinvl1_vol 4103; GFX7-NEXT: s_endpgm 4104; 4105; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: 4106; GFX10-WGP: ; %bb.0: ; %entry 4107; GFX10-WGP-NEXT: s_clause 0x1 4108; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4109; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4110; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4111; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4112; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4113; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4114; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4115; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4116; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4117; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4118; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4119; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4120; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4121; GFX10-WGP-NEXT: buffer_gl0_inv 4122; GFX10-WGP-NEXT: buffer_gl1_inv 4123; GFX10-WGP-NEXT: s_endpgm 4124; 4125; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: 4126; GFX10-CU: ; %bb.0: ; %entry 4127; GFX10-CU-NEXT: s_clause 0x1 4128; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4129; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4130; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4131; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4132; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4133; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4134; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4135; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4136; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4137; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4138; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4139; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4140; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4141; GFX10-CU-NEXT: buffer_gl0_inv 4142; GFX10-CU-NEXT: buffer_gl1_inv 4143; GFX10-CU-NEXT: s_endpgm 
4144; 4145; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: 4146; SKIP-CACHE-INV: ; %bb.0: ; %entry 4147; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4148; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4149; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4150; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4151; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4152; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4153; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4154; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4155; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4156; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4157; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4158; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4159; SKIP-CACHE-INV-NEXT: s_endpgm 4160 i32* %out, i32 %in, i32 %old) { 4161entry: 4162 %gep = getelementptr i32, i32* %out, i32 4 4163 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire 4164 ret void 4165} 4166
; Non-returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering seq_cst, failure ordering acquire.
; Expected code waits before the atomic, then waits and invalidates after it:
; buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010, and
; no invalidate in the -amdgcn-skip-cache-invalidations run.
4167define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( 4168; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: 4169; GFX7: ; %bb.0: ; %entry 4170; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4171; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4172; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4173; GFX7-NEXT: s_add_u32 s0, s0, 16 4174; GFX7-NEXT: s_addc_u32 s1, s1, 0 4175; GFX7-NEXT: v_mov_b32_e32 v0, s0 4176; GFX7-NEXT: v_mov_b32_e32 v2, s2 4177; GFX7-NEXT: v_mov_b32_e32 v1, s1 4178; GFX7-NEXT: v_mov_b32_e32 v3, s3 4179; GFX7-NEXT: s_waitcnt vmcnt(0) 4180; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4181; GFX7-NEXT: s_waitcnt vmcnt(0) 4182; GFX7-NEXT: buffer_wbinvl1_vol 4183; GFX7-NEXT: s_endpgm 4184; 4185; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: 4186; GFX10-WGP: ; %bb.0: ; %entry 4187; GFX10-WGP-NEXT: s_clause 0x1 4188; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4189; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4190; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) 4191; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4192; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4193; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4194; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4195; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4196; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4197; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4198; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4199; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4200; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4201; GFX10-WGP-NEXT: buffer_gl0_inv 4202; GFX10-WGP-NEXT: buffer_gl1_inv 4203; GFX10-WGP-NEXT: s_endpgm 4204; 4205; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: 4206; GFX10-CU: ; %bb.0: ; %entry 4207; GFX10-CU-NEXT: s_clause 0x1 4208; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4209; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4210; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4211; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4212; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4213; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4214; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4215; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4216; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4217; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4218; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4219; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4220; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4221; GFX10-CU-NEXT: buffer_gl0_inv 4222; GFX10-CU-NEXT: buffer_gl1_inv 4223; GFX10-CU-NEXT: s_endpgm 4224; 4225; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: 4226; SKIP-CACHE-INV: ; %bb.0: ; %entry 4227; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4228; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4229; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4230; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4231; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4232; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4233; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4234; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4235; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4236; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4237; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4238; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4239; SKIP-CACHE-INV-NEXT: s_endpgm 4240 i32* %out, i32 %in, i32 %old) { 4241entry: 4242 %gep = getelementptr i32, i32* %out, i32 4 4243 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire 4244 ret void 4245} 4246
; Non-returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering seq_cst, failure ordering seq_cst.
; Expected code waits before the atomic, then waits and invalidates after it:
; buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010, and
; no invalidate in the -amdgcn-skip-cache-invalidations run.
4247define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( 4248; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: 4249; GFX7: ; %bb.0: ; %entry 4250; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4251; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4252; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4253; GFX7-NEXT: s_add_u32 s0, s0, 16 4254; GFX7-NEXT: s_addc_u32 s1, s1, 0 4255; GFX7-NEXT: v_mov_b32_e32 v0, s0 4256; GFX7-NEXT: v_mov_b32_e32 v2, s2 4257; GFX7-NEXT: v_mov_b32_e32 v1, s1 4258; GFX7-NEXT: v_mov_b32_e32 v3, s3 4259; GFX7-NEXT: s_waitcnt vmcnt(0) 4260; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4261; GFX7-NEXT: s_waitcnt vmcnt(0) 4262; GFX7-NEXT: buffer_wbinvl1_vol 4263; GFX7-NEXT: s_endpgm 4264; 4265; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: 4266; GFX10-WGP: ; %bb.0: ; %entry 4267; GFX10-WGP-NEXT: s_clause 0x1 4268; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4269; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4270; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4271; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 4272; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 4273; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4274; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4275; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4276; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4277; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4278; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4279; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4280; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4281; GFX10-WGP-NEXT: buffer_gl0_inv 4282; GFX10-WGP-NEXT: buffer_gl1_inv 4283; GFX10-WGP-NEXT: s_endpgm 4284; 4285; 
GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: 4286; GFX10-CU: ; %bb.0: ; %entry 4287; GFX10-CU-NEXT: s_clause 0x1 4288; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4289; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4290; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4291; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 4292; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 4293; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4294; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4295; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4296; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4297; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4298; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4299; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4300; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4301; GFX10-CU-NEXT: buffer_gl0_inv 4302; GFX10-CU-NEXT: buffer_gl1_inv 4303; GFX10-CU-NEXT: s_endpgm 4304; 4305; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: 4306; SKIP-CACHE-INV: ; %bb.0: ; %entry 4307; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4308; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4309; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4310; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 4311; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 4312; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4313; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4314; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4315; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4316; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4317; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4318; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4319; SKIP-CACHE-INV-NEXT: s_endpgm 4320 i32* %out, i32 %in, i32 %old) { 4321entry: 4322 %gep = getelementptr i32, i32* %out, i32 4 4323 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst 4324 ret void 4325} 4326
; Returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering acquire, failure ordering
; monotonic; field 0 of the result is stored to %out. Expected code issues the
; glc atomic with no s_waitcnt vmcnt(0) directly before it, then waits and
; invalidates (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on
; gfx1010, no invalidate with -amdgcn-skip-cache-invalidations) before the store.
4327define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( 4328; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: 4329; GFX7: ; %bb.0: ; %entry 4330; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4331; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4332; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4333; GFX7-NEXT: s_add_u32 s4, s0, 16 4334; GFX7-NEXT: s_addc_u32 s5, s1, 0 4335; GFX7-NEXT: v_mov_b32_e32 v0, s4 4336; GFX7-NEXT: v_mov_b32_e32 v2, s2 4337; GFX7-NEXT: v_mov_b32_e32 v1, s5 4338; GFX7-NEXT: v_mov_b32_e32 v3, s3 4339; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4340; GFX7-NEXT: s_waitcnt vmcnt(0) 4341; GFX7-NEXT: buffer_wbinvl1_vol 4342; GFX7-NEXT: v_mov_b32_e32 v0, s0 4343; GFX7-NEXT: v_mov_b32_e32 v1, s1 4344; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4345; GFX7-NEXT: flat_store_dword v[0:1], v2 4346; GFX7-NEXT: s_endpgm 4347; 4348; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: 4349; GFX10-WGP: ; %bb.0: ; %entry 4350; GFX10-WGP-NEXT: s_clause 0x1 4351; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4352; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4353; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4354; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4355; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4356; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4357; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4358; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4359; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4360; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4361; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4362; GFX10-WGP-NEXT: buffer_gl0_inv 4363; GFX10-WGP-NEXT: buffer_gl1_inv 4364; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4365; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4366; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4367; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4368; GFX10-WGP-NEXT: s_endpgm 4369; 4370; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: 4371; GFX10-CU: ; %bb.0: ; %entry 4372; GFX10-CU-NEXT: s_clause 0x1 4373; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4374; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4375; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4376; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4377; GFX10-CU-NEXT: 
s_addc_u32 s5, s1, 0 4378; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4379; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4380; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4381; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4382; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4383; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4384; GFX10-CU-NEXT: buffer_gl0_inv 4385; GFX10-CU-NEXT: buffer_gl1_inv 4386; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4387; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4388; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4389; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4390; GFX10-CU-NEXT: s_endpgm 4391; 4392; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: 4393; SKIP-CACHE-INV: ; %bb.0: ; %entry 4394; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4395; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4396; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4397; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4398; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4399; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4400; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4401; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4402; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4403; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4404; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4405; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4406; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4407; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4408; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4409; SKIP-CACHE-INV-NEXT: s_endpgm 4410 i32* %out, i32 %in, i32 %old) { 4411entry: 4412 %gep = getelementptr i32, i32* %out, i32 4 4413 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic 4414 %val0 = extractvalue { i32, i1 } %val, 0 4415 store i32 %val0, i32* %out, align 4 4416 ret void 4417} 4418
; Returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering acq_rel, failure ordering
; monotonic; field 0 of the result is stored to %out. Expected code waits on
; vmcnt(0) (plus s_waitcnt_vscnt on gfx1010) before the glc atomic, then waits
; and invalidates (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv
; on gfx1010, no invalidate with -amdgcn-skip-cache-invalidations) before the store.
4419define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( 4420; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: 4421; GFX7: ; %bb.0: ; %entry 4422; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4423; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4424; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4425; GFX7-NEXT: s_add_u32 s4, s0, 16 4426; GFX7-NEXT: s_addc_u32 s5, s1, 0 4427; GFX7-NEXT: v_mov_b32_e32 v0, s4 4428; GFX7-NEXT: v_mov_b32_e32 v2, s2 4429; GFX7-NEXT: v_mov_b32_e32 v1, s5 4430; GFX7-NEXT: v_mov_b32_e32 v3, s3 4431; GFX7-NEXT: s_waitcnt vmcnt(0) 4432; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4433; GFX7-NEXT: s_waitcnt vmcnt(0) 4434; GFX7-NEXT: buffer_wbinvl1_vol 4435; GFX7-NEXT: v_mov_b32_e32 v0, s0 4436; GFX7-NEXT: v_mov_b32_e32 v1, s1 4437; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4438; GFX7-NEXT: flat_store_dword v[0:1], v2 4439; GFX7-NEXT: s_endpgm 4440; 4441; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: 4442; GFX10-WGP: ; %bb.0: ; %entry 4443; GFX10-WGP-NEXT: s_clause 0x1 4444; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4445; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4446; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4447; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4448; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4449; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4450; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4451; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4452; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4453; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4454; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4455; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4456; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4457; GFX10-WGP-NEXT: buffer_gl0_inv 4458; GFX10-WGP-NEXT: buffer_gl1_inv 4459; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4460; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4461; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4462; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4463; GFX10-WGP-NEXT: s_endpgm 4464; 4465; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: 4466; GFX10-CU: ; %bb.0: ; %entry 4467; GFX10-CU-NEXT: s_clause 0x1 4468; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4469; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 4470; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4471; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4472; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4473; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4474; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4475; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4476; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4477; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4478; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4479; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4480; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4481; GFX10-CU-NEXT: buffer_gl0_inv 4482; GFX10-CU-NEXT: buffer_gl1_inv 4483; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4484; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4485; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4486; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4487; GFX10-CU-NEXT: s_endpgm 4488; 4489; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: 4490; SKIP-CACHE-INV: ; %bb.0: ; %entry 4491; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4492; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4493; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4494; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4495; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4496; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4497; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4498; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4499; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4500; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4501; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4502; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4503; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4504; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4505; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4506; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4507; SKIP-CACHE-INV-NEXT: s_endpgm 4508 i32* %out, i32 %in, i32 %old) { 4509entry: 4510 %gep = getelementptr i32, i32* %out, i32 4 4511 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic 4512 %val0 = extractvalue { i32, i1 } 
%val, 0 4513 store i32 %val0, i32* %out, align 4 4514 ret void 4515} 4516
; Returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering seq_cst, failure ordering
; monotonic; field 0 of the result is stored to %out. Expected code waits on
; vmcnt(0) (plus s_waitcnt_vscnt on gfx1010) before the glc atomic, then waits
; and invalidates (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv
; on gfx1010, no invalidate with -amdgcn-skip-cache-invalidations) before the store.
4517define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( 4518; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: 4519; GFX7: ; %bb.0: ; %entry 4520; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4521; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4522; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4523; GFX7-NEXT: s_add_u32 s4, s0, 16 4524; GFX7-NEXT: s_addc_u32 s5, s1, 0 4525; GFX7-NEXT: v_mov_b32_e32 v0, s4 4526; GFX7-NEXT: v_mov_b32_e32 v2, s2 4527; GFX7-NEXT: v_mov_b32_e32 v1, s5 4528; GFX7-NEXT: v_mov_b32_e32 v3, s3 4529; GFX7-NEXT: s_waitcnt vmcnt(0) 4530; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4531; GFX7-NEXT: s_waitcnt vmcnt(0) 4532; GFX7-NEXT: buffer_wbinvl1_vol 4533; GFX7-NEXT: v_mov_b32_e32 v0, s0 4534; GFX7-NEXT: v_mov_b32_e32 v1, s1 4535; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4536; GFX7-NEXT: flat_store_dword v[0:1], v2 4537; GFX7-NEXT: s_endpgm 4538; 4539; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: 4540; GFX10-WGP: ; %bb.0: ; %entry 4541; GFX10-WGP-NEXT: s_clause 0x1 4542; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4543; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4544; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4545; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4546; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4547; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4548; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4549; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4550; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4551; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4552; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4553; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4554; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4555; GFX10-WGP-NEXT: buffer_gl0_inv 4556; GFX10-WGP-NEXT: buffer_gl1_inv 4557; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4558; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4559; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4560; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4561; 
GFX10-WGP-NEXT: s_endpgm 4562; 4563; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: 4564; GFX10-CU: ; %bb.0: ; %entry 4565; GFX10-CU-NEXT: s_clause 0x1 4566; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4567; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4568; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4569; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4570; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4571; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4572; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4573; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4574; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4575; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4576; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4577; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4578; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4579; GFX10-CU-NEXT: buffer_gl0_inv 4580; GFX10-CU-NEXT: buffer_gl1_inv 4581; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4582; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4583; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4584; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4585; GFX10-CU-NEXT: s_endpgm 4586; 4587; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: 4588; SKIP-CACHE-INV: ; %bb.0: ; %entry 4589; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4590; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4591; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4592; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4593; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4594; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4595; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4596; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4597; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4598; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4599; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4600; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4601; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4602; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4603; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4604; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4605; 
SKIP-CACHE-INV-NEXT: s_endpgm 4606 i32* %out, i32 %in, i32 %old) { 4607entry: 4608 %gep = getelementptr i32, i32* %out, i32 4 4609 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic 4610 %val0 = extractvalue { i32, i1 } %val, 0 4611 store i32 %val0, i32* %out, align 4 4612 ret void 4613} 4614
; Returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering acquire, failure ordering acquire;
; field 0 of the result is stored to %out. Expected code issues the glc atomic
; with no s_waitcnt vmcnt(0) directly before it, then waits and invalidates
; (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on gfx1010, no
; invalidate with -amdgcn-skip-cache-invalidations) before the store.
4615define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( 4616; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: 4617; GFX7: ; %bb.0: ; %entry 4618; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4619; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4620; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4621; GFX7-NEXT: s_add_u32 s4, s0, 16 4622; GFX7-NEXT: s_addc_u32 s5, s1, 0 4623; GFX7-NEXT: v_mov_b32_e32 v0, s4 4624; GFX7-NEXT: v_mov_b32_e32 v2, s2 4625; GFX7-NEXT: v_mov_b32_e32 v1, s5 4626; GFX7-NEXT: v_mov_b32_e32 v3, s3 4627; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4628; GFX7-NEXT: s_waitcnt vmcnt(0) 4629; GFX7-NEXT: buffer_wbinvl1_vol 4630; GFX7-NEXT: v_mov_b32_e32 v0, s0 4631; GFX7-NEXT: v_mov_b32_e32 v1, s1 4632; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4633; GFX7-NEXT: flat_store_dword v[0:1], v2 4634; GFX7-NEXT: s_endpgm 4635; 4636; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: 4637; GFX10-WGP: ; %bb.0: ; %entry 4638; GFX10-WGP-NEXT: s_clause 0x1 4639; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4640; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4641; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4642; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4643; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4644; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4645; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4646; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4647; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4648; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4649; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4650; GFX10-WGP-NEXT: buffer_gl0_inv 4651; GFX10-WGP-NEXT: buffer_gl1_inv 4652; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4653; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4654; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4655; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4656; GFX10-WGP-NEXT: s_endpgm 4657; 4658; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: 4659; GFX10-CU: ; %bb.0: ; %entry 4660; GFX10-CU-NEXT: s_clause 0x1 4661; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4662; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4663; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4664; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4665; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4666; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4667; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4668; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4669; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4670; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4671; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4672; GFX10-CU-NEXT: buffer_gl0_inv 4673; GFX10-CU-NEXT: buffer_gl1_inv 4674; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4675; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4676; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4677; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4678; GFX10-CU-NEXT: s_endpgm 4679; 4680; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: 4681; SKIP-CACHE-INV: ; %bb.0: ; %entry 4682; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4683; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4684; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4685; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4686; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4687; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4688; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4689; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4690; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4691; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4692; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4693; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4694; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4695; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4696; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4697; 
SKIP-CACHE-INV-NEXT: s_endpgm 4698 i32* %out, i32 %in, i32 %old) { 4699entry: 4700 %gep = getelementptr i32, i32* %out, i32 4 4701 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire 4702 %val0 = extractvalue { i32, i1 } %val, 0 4703 store i32 %val0, i32* %out, align 4 4704 ret void 4705} 4706
; Returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering release, failure ordering acquire;
; field 0 of the result is stored to %out. Expected code waits on vmcnt(0)
; (plus s_waitcnt_vscnt on gfx1010) before the glc atomic, then waits and
; invalidates (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on
; gfx1010, no invalidate with -amdgcn-skip-cache-invalidations) before the store.
4707define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( 4708; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: 4709; GFX7: ; %bb.0: ; %entry 4710; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4711; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4712; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4713; GFX7-NEXT: s_add_u32 s4, s0, 16 4714; GFX7-NEXT: s_addc_u32 s5, s1, 0 4715; GFX7-NEXT: v_mov_b32_e32 v0, s4 4716; GFX7-NEXT: v_mov_b32_e32 v2, s2 4717; GFX7-NEXT: v_mov_b32_e32 v1, s5 4718; GFX7-NEXT: v_mov_b32_e32 v3, s3 4719; GFX7-NEXT: s_waitcnt vmcnt(0) 4720; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4721; GFX7-NEXT: s_waitcnt vmcnt(0) 4722; GFX7-NEXT: buffer_wbinvl1_vol 4723; GFX7-NEXT: v_mov_b32_e32 v0, s0 4724; GFX7-NEXT: v_mov_b32_e32 v1, s1 4725; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4726; GFX7-NEXT: flat_store_dword v[0:1], v2 4727; GFX7-NEXT: s_endpgm 4728; 4729; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: 4730; GFX10-WGP: ; %bb.0: ; %entry 4731; GFX10-WGP-NEXT: s_clause 0x1 4732; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4733; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4734; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4735; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4736; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4737; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4738; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4739; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4740; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4741; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4742; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4743; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4744; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4745; 
GFX10-WGP-NEXT: buffer_gl0_inv 4746; GFX10-WGP-NEXT: buffer_gl1_inv 4747; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4748; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4749; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4750; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4751; GFX10-WGP-NEXT: s_endpgm 4752; 4753; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: 4754; GFX10-CU: ; %bb.0: ; %entry 4755; GFX10-CU-NEXT: s_clause 0x1 4756; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4757; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4758; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4759; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4760; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4761; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4762; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4763; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4764; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4765; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4766; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4767; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4768; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4769; GFX10-CU-NEXT: buffer_gl0_inv 4770; GFX10-CU-NEXT: buffer_gl1_inv 4771; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4772; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4773; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4774; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4775; GFX10-CU-NEXT: s_endpgm 4776; 4777; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: 4778; SKIP-CACHE-INV: ; %bb.0: ; %entry 4779; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4780; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4781; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4782; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4783; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4784; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4785; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4786; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4787; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4788; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4789; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4790; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4791; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4792; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4793; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4794; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4795; SKIP-CACHE-INV-NEXT: s_endpgm 4796 i32* %out, i32 %in, i32 %old) { 4797entry: 4798 %gep = getelementptr i32, i32* %out, i32 4 4799 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire 4800 %val0 = extractvalue { i32, i1 } %val, 0 4801 store i32 %val0, i32* %out, align 4 4802 ret void 4803} 4804
; Returning volatile cmpxchg on the i32* at %out + 4 elements (16 bytes),
; syncscope "agent-one-as", success ordering acq_rel, failure ordering acquire;
; field 0 of the result is stored to %out. Expected code waits on vmcnt(0)
; (plus s_waitcnt_vscnt on gfx1010) before the glc atomic, then waits and
; invalidates (buffer_wbinvl1_vol on gfx700, buffer_gl0_inv + buffer_gl1_inv on
; gfx1010, no invalidate with -amdgcn-skip-cache-invalidations) before the store.
4805define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( 4806; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: 4807; GFX7: ; %bb.0: ; %entry 4808; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4809; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4810; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4811; GFX7-NEXT: s_add_u32 s4, s0, 16 4812; GFX7-NEXT: s_addc_u32 s5, s1, 0 4813; GFX7-NEXT: v_mov_b32_e32 v0, s4 4814; GFX7-NEXT: v_mov_b32_e32 v2, s2 4815; GFX7-NEXT: v_mov_b32_e32 v1, s5 4816; GFX7-NEXT: v_mov_b32_e32 v3, s3 4817; GFX7-NEXT: s_waitcnt vmcnt(0) 4818; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4819; GFX7-NEXT: s_waitcnt vmcnt(0) 4820; GFX7-NEXT: buffer_wbinvl1_vol 4821; GFX7-NEXT: v_mov_b32_e32 v0, s0 4822; GFX7-NEXT: v_mov_b32_e32 v1, s1 4823; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4824; GFX7-NEXT: flat_store_dword v[0:1], v2 4825; GFX7-NEXT: s_endpgm 4826; 4827; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: 4828; GFX10-WGP: ; %bb.0: ; %entry 4829; GFX10-WGP-NEXT: s_clause 0x1 4830; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4831; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4832; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4833; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4834; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4835; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4836; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4837; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 
4838; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4839; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4840; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4841; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4842; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4843; GFX10-WGP-NEXT: buffer_gl0_inv 4844; GFX10-WGP-NEXT: buffer_gl1_inv 4845; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4846; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4847; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4848; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4849; GFX10-WGP-NEXT: s_endpgm 4850; 4851; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: 4852; GFX10-CU: ; %bb.0: ; %entry 4853; GFX10-CU-NEXT: s_clause 0x1 4854; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4855; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4856; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4857; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4858; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4859; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4860; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4861; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4862; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4863; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4864; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4865; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4866; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4867; GFX10-CU-NEXT: buffer_gl0_inv 4868; GFX10-CU-NEXT: buffer_gl1_inv 4869; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4870; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4871; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4872; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4873; GFX10-CU-NEXT: s_endpgm 4874; 4875; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: 4876; SKIP-CACHE-INV: ; %bb.0: ; %entry 4877; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4878; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4879; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4880; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4881; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4882; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4883; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4884; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4885; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4886; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4887; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4888; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4889; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4890; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4891; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4892; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4893; SKIP-CACHE-INV-NEXT: s_endpgm 4894 i32* %out, i32 %in, i32 %old) { 4895entry: 4896 %gep = getelementptr i32, i32* %out, i32 4 4897 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire 4898 %val0 = extractvalue { i32, i1 } %val, 0 4899 store i32 %val0, i32* %out, align 4 4900 ret void 4901} 4902 4903define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( 4904; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: 4905; GFX7: ; %bb.0: ; %entry 4906; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4907; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4908; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4909; GFX7-NEXT: s_add_u32 s4, s0, 16 4910; GFX7-NEXT: s_addc_u32 s5, s1, 0 4911; GFX7-NEXT: v_mov_b32_e32 v0, s4 4912; GFX7-NEXT: v_mov_b32_e32 v2, s2 4913; GFX7-NEXT: v_mov_b32_e32 v1, s5 4914; GFX7-NEXT: v_mov_b32_e32 v3, s3 4915; GFX7-NEXT: s_waitcnt vmcnt(0) 4916; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4917; GFX7-NEXT: s_waitcnt vmcnt(0) 4918; GFX7-NEXT: buffer_wbinvl1_vol 4919; GFX7-NEXT: v_mov_b32_e32 v0, s0 4920; GFX7-NEXT: v_mov_b32_e32 v1, s1 4921; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4922; GFX7-NEXT: flat_store_dword v[0:1], v2 4923; GFX7-NEXT: s_endpgm 4924; 4925; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: 4926; GFX10-WGP: ; %bb.0: ; %entry 4927; GFX10-WGP-NEXT: s_clause 0x1 4928; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4929; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 
4930; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4931; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 4932; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 4933; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 4934; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 4935; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 4936; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 4937; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4938; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4939; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4940; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4941; GFX10-WGP-NEXT: buffer_gl0_inv 4942; GFX10-WGP-NEXT: buffer_gl1_inv 4943; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4944; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4945; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4946; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 4947; GFX10-WGP-NEXT: s_endpgm 4948; 4949; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: 4950; GFX10-CU: ; %bb.0: ; %entry 4951; GFX10-CU-NEXT: s_clause 0x1 4952; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4953; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4954; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4955; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 4956; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 4957; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 4958; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 4959; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 4960; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 4961; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4962; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 4963; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4964; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4965; GFX10-CU-NEXT: buffer_gl0_inv 4966; GFX10-CU-NEXT: buffer_gl1_inv 4967; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4968; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4969; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4970; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 4971; GFX10-CU-NEXT: s_endpgm 4972; 4973; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: 4974; SKIP-CACHE-INV: ; %bb.0: ; %entry 4975; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 4976; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4977; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4978; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 4979; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 4980; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 4981; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 4982; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 4983; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 4984; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4985; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4986; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 4987; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 4988; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 4989; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4990; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 4991; SKIP-CACHE-INV-NEXT: s_endpgm 4992 i32* %out, i32 %in, i32 %old) { 4993entry: 4994 %gep = getelementptr i32, i32* %out, i32 4 4995 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire 4996 %val0 = extractvalue { i32, i1 } %val, 0 4997 store i32 %val0, i32* %out, align 4 4998 ret void 4999} 5000 5001define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( 5002; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: 5003; GFX7: ; %bb.0: ; %entry 5004; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5005; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 5006; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5007; GFX7-NEXT: s_add_u32 s4, s0, 16 5008; GFX7-NEXT: s_addc_u32 s5, s1, 0 5009; GFX7-NEXT: v_mov_b32_e32 v0, s4 5010; GFX7-NEXT: v_mov_b32_e32 v2, s2 5011; GFX7-NEXT: v_mov_b32_e32 v1, s5 5012; GFX7-NEXT: v_mov_b32_e32 v3, s3 5013; GFX7-NEXT: s_waitcnt vmcnt(0) 5014; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5015; GFX7-NEXT: s_waitcnt vmcnt(0) 5016; GFX7-NEXT: buffer_wbinvl1_vol 5017; GFX7-NEXT: v_mov_b32_e32 v0, s0 5018; GFX7-NEXT: v_mov_b32_e32 v1, s1 5019; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5020; GFX7-NEXT: flat_store_dword v[0:1], v2 5021; GFX7-NEXT: s_endpgm 5022; 
5023; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: 5024; GFX10-WGP: ; %bb.0: ; %entry 5025; GFX10-WGP-NEXT: s_clause 0x1 5026; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5027; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5028; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5029; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 5030; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 5031; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 5032; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 5033; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 5034; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 5035; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5036; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 5037; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5038; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 5039; GFX10-WGP-NEXT: buffer_gl0_inv 5040; GFX10-WGP-NEXT: buffer_gl1_inv 5041; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 5042; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 5043; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 5044; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 5045; GFX10-WGP-NEXT: s_endpgm 5046; 5047; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: 5048; GFX10-CU: ; %bb.0: ; %entry 5049; GFX10-CU-NEXT: s_clause 0x1 5050; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5051; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 5052; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5053; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 5054; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 5055; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 5056; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 5057; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 5058; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 5059; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5060; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 5061; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5062; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 5063; GFX10-CU-NEXT: buffer_gl0_inv 5064; GFX10-CU-NEXT: buffer_gl1_inv 5065; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 5066; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 5067; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 5068; GFX10-CU-NEXT: 
flat_store_dword v[0:1], v2 5069; GFX10-CU-NEXT: s_endpgm 5070; 5071; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: 5072; SKIP-CACHE-INV: ; %bb.0: ; %entry 5073; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 5074; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5075; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5076; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 5077; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 5078; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 5079; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 5080; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 5081; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 5082; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5083; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 5084; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 5085; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 5086; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 5087; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 5088; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 5089; SKIP-CACHE-INV-NEXT: s_endpgm 5090 i32* %out, i32 %in, i32 %old) { 5091entry: 5092 %gep = getelementptr i32, i32* %out, i32 4 5093 %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst 5094 %val0 = extractvalue { i32, i1 } %val, 0 5095 store i32 %val0, i32* %out, align 4 5096 ret void 5097} 5098 5099