; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s

; Tests f16/v2f16 fsub codegen: SI promotes to f32, VI uses f16 + SDWA,
; GFX9 uses packed v_pk_add_f16 with source negation for vector cases.

; GCN-LABEL: {{^}}fsub_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fsub_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fsub half %a.val, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fsub_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fsub half 1.0, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fsub_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %r.val = fsub half %a.val, 2.0
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16:
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]


; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

define amdgpu_kernel void @fsub_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fsub <2 x half> %a.val, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

define amdgpu_kernel void @fsub_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}}

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

define amdgpu_kernel void @fsub_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}