; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s

; Codegen checks for "a * b + K" with a floating-point literal K.
; Each kernel below multiplies two floats and adds a constant; the checks
; pin which multiply-add instruction form is emitted per target. The last
; run line adds -fp-contract=fast, and its checks expect the fma-flavoured
; mnemonics instead of the mad-flavoured ones.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone

; Both multiplicands are loaded per work-item and the addend is 10.0
; (0x41200000 as an IEEE-754 single), which is expected to be folded into
; the instruction as a literal.
; GCN-LABEL: {{^}}madak_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure this is only folded with one use. This is a code size
; optimization and if we fold the immediate multiple times, we'll undo
; it.

; GCN-LABEL: {{^}}madak_2_use_f32:
; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]],
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]],
; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]],
; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2

  %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  ; NOTE(review): %out.gep.1 is derived from %in.gep.0, not %out.gep.0, so
  ; the second store below writes into the input buffer. Left unchanged
  ; because a different address could perturb the checked instruction
  ; sequence - confirm whether this is intentional.
  %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1

  ; Volatile accesses keep the three loads and two stores distinct.
  %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %c = load volatile float, float addrspace(1)* %in.gep.2, align 4

  ; The same constant 10.0 is the addend of two separate multiply-adds.
  %mul0 = fmul float %a, %b
  %mul1 = fmul float %a, %c
  %madak0 = fadd float %mul0, 10.0
  %madak1 = fadd float %mul1, 10.0

  store volatile float %madak0, float addrspace(1)* %out.gep.0, align 4
  store volatile float %madak1, float addrspace(1)* %out.gep.1, align 4
  ret void
}

; One multiplicand is the constant 4.0, which the checks expect to appear
; directly as the first source operand next to the 10.0 literal addend.
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4

  %mul = fmul float 4.0, %a
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure nothing weird happens with a value that is also allowed as
; an inline immediate.

; The addend here is 4.0 instead of 10.0; the checks expect the plain
; three-operand mad/fma form with 4.0 as the last operand rather than the
; literal-constant (madak/fmaak) form.
; GCN-LABEL: {{^}}madak_inline_imm_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 4.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; We can't use an SGPR when forming madak
; Here %b is a kernel argument (read with s_load_dword per the checks) and
; %a is loaded per work-item.
; GCN-LABEL: {{^}}s_v_madak_f32:
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
; FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Mirror of the previous test with the operand roles swapped: %a is the
; kernel argument and %b is loaded per work-item. Only the gfx1010 runs
; expect the literal-constant form here.
; GCN-LABEL: {{^}}v_s_madak_f32:
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GFX6_8_9-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Both multiplicands are kernel arguments; no run may emit v_madak_f32.
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) #0 {
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out, align 4
  ret void
}

; The first multiplicand goes through fabs; the checks expect the |x|
; source modifier on the first operand of a plain mad/fma instead of the
; literal-constant form.
; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone

  %mul = fmul float %a.fabs, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same as the src0 variant, but fabs is applied to the second multiplicand
; and the |x| modifier is expected on the second source operand.
; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone

  %mul = fmul float %a, %b.fabs
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
; because the implicit immediate already uses the constant bus.
; On GFX10+ we can use two scalar operands.
; The addend in this test is 42.0 (0x42280000 as an IEEE-754 single).
; GCN-LABEL: {{^}}madak_constant_bus_violation:
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}

; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GFX6: buffer_store_dword [[MUL]]
; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
bb:
  %tmp = icmp eq i32 %arg1, 0
  br i1 %tmp, label %bb3, label %bb4

bb3:
  store volatile float 0.0, float addrspace(1)* undef
  br label %bb4

bb4:
  %vgpr = load volatile float, float addrspace(1)* undef
  %tmp0 = fmul float %sgpr0, 0.5
  %tmp1 = fadd float %tmp0, 42.0
  %tmp2 = fmul float %tmp1, %vgpr
  store volatile float %tmp2, float addrspace(1)* undef, align 4
  ret void
}

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }