; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Codegen checks for llvm.fabs on half, <2 x half> and <4 x half> values,
; run against kaveri, tonga and gfx900 (see the llc invocations above).

; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
; unless isFabsFree returns true

; fabs of a half that was bitcast from an i16 kernel argument; the checks
; expect a scalar AND with 0x7fff followed by a 16-bit store.
; GCN-LABEL: {{^}}s_fabs_free_f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
  %bc = bitcast i16 %in to half
  %fabs = call half @llvm.fabs.f16(half %bc)
  store half %fabs, half addrspace(1)* %out
  ret void
}

; fabs of a half kernel argument; same expected sequence as the bitcast
; variant above.
; GCN-LABEL: {{^}}s_fabs_f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
  %fabs = call half @llvm.fabs.f16(half %in)
  store half %fabs, half addrspace(1)* %out
  ret void
}

; fabs of a <2 x half> kernel argument; both sign bits are expected to be
; cleared by a single scalar AND with 0x7fff7fff.
; GCN-LABEL: {{^}}s_fabs_v2f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %out
  ret void
}

; fabs of a <4 x half> kernel argument; the checks expect the mask to be
; materialized once and applied to each 32-bit half of the loaded pair.
; GCN-LABEL: {{^}}s_fabs_v4f16:
; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2
; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8

; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
; GCN: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
  %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
  store <4 x half> %fabs, <4 x half> addrspace(1)* %out
  ret void
}

; fabs feeding an fmul; the checks expect the abs to appear as a |x| source
; modifier on the consuming instruction rather than as a separate AND.
; GCN-LABEL: {{^}}fabs_fold_f16:
; GCN: s_load_dword [[IN0:s[0-9]+]]
; GCN-DAG: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16

; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
; CI-DAG: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]]
; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]

; GFX89-NOT: and
; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
; GFX89: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
  %fabs = call half @llvm.fabs.f16(half %in0)
  %fmul = fmul half %fabs, %in1
  store half %fmul, half addrspace(1)* %out
  ret void
}

; fabs of a <2 x half> loaded from memory at a per-workitem index; the
; checks expect a vector AND with 0x7fff7fff on the loaded dword.
; GCN-LABEL: {{^}}v_fabs_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]]
define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  ; The output address is derived from %out so the result does not overwrite
  ; the input buffer and the %out argument is actually used.
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
  ret void
}

; fabs of a <2 x half> that was bitcast from an i32 kernel argument; the
; checks expect a scalar AND with 0x7fff7fff.
; GCN-LABEL: {{^}}fabs_free_v2f16:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
  %bc = bitcast i32 %in to <2 x half>
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %out
  ret void
}

; FIXME: Should do fabs after conversion to avoid converting multiple
; times in this particular case.

; fabs(x) * x on a loaded <2 x half>, i.e. the same value is used both with
; and without the abs.
; GCN-LABEL: {{^}}v_fabs_fold_self_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]

; CI: v_lshrrev_b32_e32 [[VREG:v[0-9]+]], 16, v{{[0-9]+}}
; CI: v_cvt_f32_f16_e32 [[NORM:v[0-9]+]], [[VREG]]
; CI: v_cvt_f32_f16_e64 [[ABS:v[0-9]+]], {{\|}}[[VREG]]{{\|}}
; CI: v_mul_f32_e32 v{{[0-9]+}}, [[ABS]], [[NORM]]
; CI: v_cvt_f16_f32
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32

; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}

; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
define amdgpu_kernel void @v_fabs_fold_self_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %fmul = fmul <2 x half> %fabs, %val
  store <2 x half> %fmul, <2 x half> addrspace(1)* %out
  ret void
}

; fabs of a loaded <2 x half> multiplied by a second <2 x half> that comes
; from an i32 kernel argument.
; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]

; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; CI: v_cvt_f32_f16_e32
; CI: v_cvt_f32_f16_e32
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32

; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, s{{[0-9]+}}

; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], s{{[0-9]+$}}
define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %other.val) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %other.val.cvt = bitcast i32 %other.val to <2 x half>
  %fmul = fmul <2 x half> %fabs, %other.val.cvt
  store <2 x half> %fmul, <2 x half> addrspace(1)* %out
  ret void
}

; Each element of a vector fabs is extracted and fed to a scalar half
; operation; on tonga and gfx900 the checks expect the abs to appear as a
; |x| source modifier on each user.
; GCN-LABEL: {{^}}v_extract_fabs_fold_v2f16:
; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}

; GFX89-DAG: v_mul_f16_e64 v{{[0-9]+}}, |[[VAL]]|, 4.0
; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; GFX89-DAG: v_add_f16_sdwa v{{[0-9]+}}, |[[VAL]]|, [[CONST2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %elt0 = extractelement <2 x half> %fabs, i32 0
  %elt1 = extractelement <2 x half> %fabs, i32 1

  %fmul0 = fmul half %elt0, 4.0
  %fadd1 = fadd half %elt1, 2.0
  ; Volatile stores keep both results live.
  store volatile half %fmul0, half addrspace(1)* undef
  store volatile half %fadd1, half addrspace(1)* undef
  ret void
}

; Each element of a vector fabs is extracted and stored directly, so there
; is no arithmetic user to carry a source modifier; the checks expect an
; explicit vector AND with 0x7fff7fff.
; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]]


; VI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 15
; VI: flat_store_short

; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off
define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %elt0 = extractelement <2 x half> %fabs, i32 0
  %elt1 = extractelement <2 x half> %fabs, i32 1
  store volatile half %elt0, half addrspace(1)* undef
  store volatile half %elt1, half addrspace(1)* undef
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }