; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s

declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
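
; SI has no f16 instructions, so fmuladd is promoted to f32: the operands
; are converted with v_cvt_f32_f16, multiply-accumulated with v_mac_f32,
; and the result is converted back. VI selects v_mac_f16 when f16
; denormals are flushed, but must select v_fma_f16 when they are enabled,
; since v_mac_f16 does not preserve denormal results. The _imm variants
; check handling of the constant 3.0 (0x4200 as f16, 0x40400000 after
; promotion to f32): v_mac can take it as a literal, while the VOP3
; v_fma_f16 needs it moved into a register first.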

; GCN-LABEL: {{^}}fmuladd_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]

; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]

; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fmuladd.f16(half %a.val, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fmuladd_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]

; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]

; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]

; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fmuladd_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]

; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]

; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]

; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c) {
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}
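
; For <2 x half> the operation is scalarized: the high halves are unpacked
; with v_lshrrev_b32, each element is computed as in the scalar case (on
; VI the high half folds the extract into v_mac_f16_sdwa), and the two
; results are repacked with v_lshlrev_b32 + v_or_b32. Packed f16
; arithmetic is not available before gfx9.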

; GCN-LABEL: {{^}}fmuladd_v2f16
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]

; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
; VI-FLUSH-NOT: v_and_b32
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]

; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
; VI-DENORM-NOT: v_and_b32
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]

; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @fmuladd_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}