; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s

; Each kernel below loads three floats (volatile, indexed by workitem id),
; builds max(min(a, b), min(max(a, b), c)) out of llvm.minnum/llvm.maxnum and
; stores the result. The assertions record whether that chain is emitted as a
; single v_med3_f32 or as separate v_min_f32/v_max_f32 instructions.
; Only one check prefix is used per run line because every function has
; distinct output on all three targets.

; Function attribute #2 ("no-nans-fp-math"="true"); %a is negated with
; `fsub -0.0, %a`. Expected output is a v_sub_f32 from 0x80000000 followed by
; one v_med3_f32.
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v8, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s7
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    flat_load_dword v2, v[4:5]
; VI-NEXT:    v_mov_b32_e32 v7, s1
; VI-NEXT:    v_mov_b32_e32 v6, s0
; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_med3_f32 v0, v0, v1, v2
; VI-NEXT:    flat_store_dword v[6:7], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %a.fneg = fsub float -0.0, %a
  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Same IR as the previous kernel but with attribute #1
; ("no-nans-fp-math"="false"). Expected output keeps separate
; v_min_f32/v_max_f32 instructions, interleaved with v_mul_f32 by 1.0 (SI/VI)
; or v_max_f32 x, x (GFX9); no v_med3_f32 is emitted.
define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v5, v2, v3
; SI-NEXT:    v_max_f32_e32 v2, v2, v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_min_f32_e32 v2, v2, v3
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e32 v2, v3, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s7
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dword v7, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    flat_load_dword v3, v[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_sub_f32_e32 v4, 0x80000000, v7
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
; VI-NEXT:    v_min_f32_e32 v5, v4, v2
; VI-NEXT:    v_max_f32_e32 v2, v4, v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_min_f32_e32 v2, v2, v3
; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_max_f32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
; GFX9-NEXT:    v_max_f32_e32 v2, v4, v4
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_max_f32_e32 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %a.fneg = fsub float -0.0, %a
  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Attribute #2. Operands are -a, fabs(b) and -fabs(c). Expected output applies
; the |b| modifier directly on the v_med3_f32 operand, while both negations
; are emitted as v_sub_f32 from 0x80000000 (with |c| as an operand modifier on
; the second one).
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s2, 0x80000000
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_sub_f32_e32 v2, s2, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v4, s2, |v4|
; SI-NEXT:    v_med3_f32 v2, v2, |v3|, v4
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s7
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dword v7, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    flat_load_dword v3, v[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_mov_b32 s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_sub_f32_e32 v4, s2, v7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
; VI-NEXT:    v_med3_f32 v2, v4, |v2|, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s2, 0x80000000
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_sub_f32_e32 v1, s2, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_f32_e64 v3, s2, |v3|
; GFX9-NEXT:    v_med3_f32 v1, v1, |v2|, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  %a.fneg = fsub float -0.0, %a
  %b.fabs = call float @llvm.fabs.f32(float %b)
  %c.fabs = call float @llvm.fabs.f32(float %c)
  %c.fabs.fneg = fsub float -0.0, %c.fabs

  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)

  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Attribute #2. All three operands are -fabs(x). Expected output is three
; v_sub_f32_e64 instructions (0x80000000 minus |x|) feeding one v_med3_f32.
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s2, 0x80000000
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_sub_f32_e64 v2, s2, |v2|
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v4, s2, |v4|
; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s7
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dword v7, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    flat_load_dword v3, v[4:5]
; VI-NEXT:    s_mov_b32 s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_sub_f32_e64 v4, s2, |v7|
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_sub_f32_e64 v2, s2, |v2|
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
; VI-NEXT:    v_med3_f32 v2, v4, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s2, 0x80000000
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_sub_f32_e64 v1, s2, |v1|
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_sub_f32_e64 v2, s2, |v2|
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_f32_e64 v3, s2, |v3|
; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  %a.fabs = call float @llvm.fabs.f32(float %a)
  %a.fabs.fneg = fsub float -0.0, %a.fabs
  %b.fabs = call float @llvm.fabs.f32(float %b)
  %b.fabs.fneg = fsub float -0.0, %b.fabs
  %c.fabs = call float @llvm.fabs.f32(float %c)
  %c.fabs.fneg = fsub float -0.0, %c.fabs

  %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)

  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Attribute #1 ("no-nans-fp-math"="false"), but every min/max input comes from
; an `fadd nnan`. The recorded output still uses separate v_min_f32/v_max_f32
; instructions; no v_med3_f32 is emitted.
define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_nnan_inputs_med3_f32_pat0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
; SI-NEXT:    v_min_f32_e32 v5, v2, v3
; SI-NEXT:    v_max_f32_e32 v2, v2, v3
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_min_f32_e32 v2, v2, v4
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e32 v2, v3, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s7
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dword v7, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    flat_load_dword v3, v[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_add_f32_e32 v4, 1.0, v7
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_add_f32_e32 v2, 2.0, v2
; VI-NEXT:    v_min_f32_e32 v5, v4, v2
; VI-NEXT:    v_max_f32_e32 v2, v4, v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f32_e32 v3, 4.0, v3
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_min_f32_e32 v2, v2, v3
; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_max_f32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
; GFX9-NEXT:    v_max_f32_e32 v2, v4, v4
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_max_f32_e32 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  %a.nnan = fadd nnan float %a, 1.0
  %b.nnan = fadd nnan float %b, 2.0
  %c.nnan = fadd nnan float %c, 4.0

  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}


; ---------------------------------------------------------------------
; Negative patterns
; ---------------------------------------------------------------------

; Attribute #1. %tmp0 (the first minnum) has a second user, a volatile store to
; an undef address. The recorded output keeps separate v_min_f32/v_max_f32
; instructions and stores the intermediate minimum as well as the final result.
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT:    v_min_f32_e32 v5, v2, v3
; SI-NEXT:    v_max_f32_e32 v2, v2, v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT:    buffer_store_dword v5, off, s[0:3], 0
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_min_f32_e32 v2, v2, v3
; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_max_f32_e32 v2, v3, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT:    v_mov_b32_e32 v4, s6
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s7
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dword v7, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    flat_load_dword v3, v[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v7
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_min_f32_e32 v5, v4, v2
; VI-NEXT:    v_max_f32_e32 v2, v4, v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_min_f32_e32 v2, v2, v3
; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT:    v_max_f32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v5
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
; GFX9-NEXT:    global_store_dword v[0:1], v4, off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
; GFX9-NEXT:    v_max_f32_e32 v2, v4, v4
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_max_f32_e32 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  store volatile float %tmp0, float addrspace(1)* undef
  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.minnum.f64(double, double) #0
declare double @llvm.maxnum.f64(double, double) #0
declare half @llvm.fabs.f16(half) #0
declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }