; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.

; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s

; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs

; Make sure we don't form mad with denormals
; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  ; nsz flag is needed since this combine may change sign of zero
  %tmp0 = fmul nsz float %u, %v
  %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub nsz float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }