; XUN: llc -mtriple=amdgcn-amd-amdhsa -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-DENORM %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-FLUSH %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-DENORM %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-FLUSH %s

; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
; add an instruction if the fadd has more than one use.
; Intrinsic declarations used by the tests below.
declare half @llvm.fabs.f16(half) #1
declare float @llvm.fabs.f32(float) #1

; f32: %a17 (= %a16 * 2.0) is used three times, twice by %a18 and once by
; %a19. The checks expect the doubling to be emitted once as an add of the
; selected value with itself, followed by one multiply and a negated
; mad/fma against 1.0.
; GCN-LABEL: {{^}}multiple_fadd_use_test_f32:
; SI: v_max_legacy_f32_e64 [[A16:v[0-9]+]],
; SI: v_add_f32_e32 [[A17:v[0-9]+]], [[A16]], [[A16]]
; SI: v_mul_f32_e32 [[A18:v[0-9]+]], [[A17]], [[A17]]
; SI: v_mad_f32 [[A20:v[0-9]+]], -[[A18]], [[A17]], 1.0
; SI: buffer_store_dword [[A20]]

; GFX8_10: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; GFX8_10: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; GFX8_10: v_cmp_gt_f32_e64 {{vcc|vcc_lo}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; GFX8_10: v_cndmask_b32_e32
; GFX8_10: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; GFX8_10: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
; GFX10: v_fma_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
  %a11 = fadd float %y, -1.0
  %a12 = call float @llvm.fabs.f32(float %a11)
  %a13 = fadd float %x, -1.0
  %a14 = call float @llvm.fabs.f32(float %a13)
  %a15 = fcmp ogt float %a12, %a14
  %a16 = select i1 %a15, float %a12, float %a14
  %a17 = fmul float %a16, 2.0
  %a18 = fmul float %a17, %a17
  %a19 = fmul float %a18, %a17
  %a20 = fsub float 1.0, %a19
  store float %a20, float addrspace(1)* %out
  ret void
}

; f32: %mul2 (= %x * 2.0) is both stored and added to %y. The checks expect
; a standalone add producing the stored value, while the fadd is emitted as
; a mac/fma that multiplies x by 2.0 directly.
; GCN-LABEL: {{^}}multiple_use_fadd_fmac_f32:
; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}}
; SIVI-DAG: v_mac_f32_e64 [[MAD:v[0-9]+]], [[X]], 2.0
; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[MUL2]]
; GCN-DAG: buffer_store_dword [[MAD]]
; GCN: s_endpgm
define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %mul2 = fmul fast float %x, 2.0
  %mad = fadd fast float %mul2, %y
  store volatile float %mul2, float addrspace(1)* %out
  store volatile float %mad, float addrspace(1)* %out.gep.1
  ret void
}

; f32: same shape as the fmac test, but the doubled operand is fabs(%x), so
; the checks expect the |x| source modifier on both the add and the mad/fma.
; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f32:
; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
; SIVI-DAG: v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[MUL2]]
; GCN-DAG: buffer_store_dword [[MAD]]
; GCN: s_endpgm
define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %x.abs = call float @llvm.fabs.f32(float %x)
  %mul2 = fmul fast float %x.abs, 2.0
  %mad = fadd fast float %mul2, %y
  store volatile float %mul2, float addrspace(1)* %out
  store volatile float %mad, float addrspace(1)* %out.gep.1
  ret void
}

; f32: %mul2 (= |%x| * 2.0) is used only by two fadds. The checks expect two
; mad/fma instructions, each taking |x| and 2.0 as its multiply operands.
; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
; SIVI: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
; SIVI: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
; GFX10: v_fma_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, {{s[0-9]+}}
; GFX10: v_fma_f32 {{v[0-9]+}}, |[[X]]|, 2.0, {{s[0-9]+}}
define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %x.abs = call float @llvm.fabs.f32(float %x)
  %mul2 = fmul fast float %x.abs, 2.0
  %mad0 = fadd fast float %mul2, %y
  %mad1 = fadd fast float %mul2, %z
  store volatile float %mad0, float addrspace(1)* %out
  store volatile float %mad1, float addrspace(1)* %out.gep.1
  ret void
}

; f32: (x * 2.0) * (x * -2.0). The checks expect the two constants to be
; merged into a single multiply by -4.0, followed by a multiply by x.
; GCN-LABEL: {{^}}fmul_x2_xn2_f32:
; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %mul2 = fmul fast float %x, 2.0
  %muln2 = fmul fast float %x, -2.0
  %mul = fmul fast float %mul2, %muln2
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; f32: (x * 2.0) * (x * -3.0). The merged constant is -6.0, whose f32 bit
; pattern 0xc0c00000 is what the checks look for as a literal operand.
; GCN-LABEL: {{^}}fmul_x2_xn3_f32:
; SIVI: v_mov_b32_e32 [[K:v[0-9]+]], 0xc0c00000
; SIVI: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
; GFX10: v_mul_f32_e64 [[TMP0:v[0-9]+]], 0xc0c00000, [[X:s[0-9]+]]
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %mul2 = fmul fast float %x, 2.0
  %muln2 = fmul fast float %x, -3.0
  %mul = fmul fast float %mul2, %muln2
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; f16 counterpart of multiple_fadd_use_test_f32. The half operands arrive as
; zero-extended i16 kernel arguments and are bitcast to half. The final
; instruction expected by the checks differs per run configuration
; (mad, fma, or a plain subtract from 1.0).
; GCN-LABEL: {{^}}multiple_fadd_use_test_f16:
; GFX8_10: v_add_f16_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; GFX8_10: v_add_f16_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; GFX8_10: v_cmp_gt_f16_e64 {{vcc|vcc_lo}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; GFX8_10: v_cndmask_b32_e32
; GFX8_10: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; GFX8_10: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
; GFX10-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
; GFX10-FLUSH: v_sub_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
  %x = bitcast i16 %x.arg to half
  %y = bitcast i16 %y.arg to half
  %z = bitcast i16 %z.arg to half
  %a11 = fadd half %y, -1.0
  %a12 = call half @llvm.fabs.f16(half %a11)
  %a13 = fadd half %x, -1.0
  %a14 = call half @llvm.fabs.f16(half %a13)
  %a15 = fcmp ogt half %a12, %a14
  %a16 = select i1 %a15, half %a12, half %a14
  %a17 = fmul half %a16, 2.0
  %a18 = fmul half %a17, %a17
  %a19 = fmul half %a18, %a17
  %a20 = fsub half 1.0, %a19
  store half %a20, half addrspace(1)* %out
  ret void
}

; f16 counterpart of multiple_use_fadd_fmac_f32. In the GFX10 flush run the
; checks expect the fadd to reuse the already-computed doubled value through
; a plain add rather than an fma.
; GCN-LABEL: {{^}}multiple_use_fadd_fmac_f16:
; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}}

; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}}
; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}}

; GCN-DAG: buffer_store_short [[MUL2]]
; GCN-DAG: buffer_store_short [[MAD]]
; GCN: s_endpgm
define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
  %x = bitcast i16 %x.arg to half
  %y = bitcast i16 %y.arg to half
  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  %mul2 = fmul fast half %x, 2.0
  %mad = fadd fast half %mul2, %y
  store volatile half %mul2, half addrspace(1)* %out
  store volatile half %mad, half addrspace(1)* %out.gep.1
  ret void
}

; f16 counterpart of multiple_use_fadd_fmad_f32: the doubled operand is
; fabs(%x), and %mul2 is both stored and added to %y.
; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f16:
; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|

; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}

; GCN-DAG: buffer_store_short [[MUL2]]
; GCN-DAG: buffer_store_short [[MAD]]
; GCN: s_endpgm
define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
  %x = bitcast i16 %x.arg to half
  %y = bitcast i16 %y.arg to half
  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  %x.abs = call half @llvm.fabs.f16(half %x)
  %mul2 = fmul fast half %x.abs, 2.0
  %mad = fadd fast half %mul2, %y
  store volatile half %mul2, half addrspace(1)* %out
  store volatile half %mad, half addrspace(1)* %out.gep.1
  ret void
}

; f16 counterpart of multiple_use_fadd_multi_fmad_f32: %mul2 feeds two fadds
; and is not stored itself. In the GFX10 flush run the checks expect one add
; computing the doubled value followed by two plain adds that reuse it.
; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f16:
; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}

; GFX10-FLUSH: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |{{s[0-9]+}}|
; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, s{{[0-9]+}}
; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, s{{[0-9]+}}

define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
  %x = bitcast i16 %x.arg to half
  %y = bitcast i16 %y.arg to half
  %z = bitcast i16 %z.arg to half
  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  %x.abs = call half @llvm.fabs.f16(half %x)
  %mul2 = fmul fast half %x.abs, 2.0
  %mad0 = fadd fast half %mul2, %y
  %mad1 = fadd fast half %mul2, %z
  store volatile half %mad0, half addrspace(1)* %out
  store volatile half %mad1, half addrspace(1)* %out.gep.1
  ret void
}

; f16 counterpart of fmul_x2_xn2_f32: (x * 2.0) * (x * -2.0), with the
; constants expected to merge into a single multiply by -4.0.
; GCN-LABEL: {{^}}fmul_x2_xn2_f16:
; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_short [[RESULT]]
define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
  %x = bitcast i16 %x.arg to half
  %y = bitcast i16 %y.arg to half
  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  %mul2 = fmul fast half %x, 2.0
  %muln2 = fmul fast half %x, -2.0
  %mul = fmul fast half %mul2, %muln2
  store volatile half %mul, half addrspace(1)* %out
  ret void
}

; f16 counterpart of fmul_x2_xn3_f32: the merged constant is -6.0, whose
; half-precision bit pattern is 0xc600.
; GCN-LABEL: {{^}}fmul_x2_xn3_f16:
; SIVI: v_mov_b32_e32 [[K:v[0-9]+]], 0xc600
; SIVI: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
; GFX10: v_mul_f16_e64 [[TMP0:v[0-9]+]], 0xc600, [[X:s[0-9]+]]
; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_short [[RESULT]]
define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
  %x = bitcast i16 %x.arg to half
  %y = bitcast i16 %y.arg to half
  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  %mul2 = fmul fast half %x, 2.0
  %muln2 = fmul fast half %x, -3.0
  %mul = fmul fast half %mul2, %muln2
  store volatile half %mul, half addrspace(1)* %out
  ret void
}

; #0 is attached to every test kernel; #1 to the fabs intrinsic declarations.
attributes #0 = { nounwind "unsafe-fp-math"="true" }
attributes #1 = { nounwind readnone }