; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s

declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
declare <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
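; SI (tahiti) has no f16 arithmetic, so f16 fma is promoted to f32 and
; selected as v_cvt_f32_f16 / v_fma_f32 / v_cvt_f16_f32, as the SI checks
; below show. VI and GFX9 select the native v_fma_f16, and GFX9 can also
; use the packed v_pk_fma_f16 for <2 x half>.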
; GCN-LABEL: {{^}}fma_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]

; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half 3.0, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half 3.0, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_f16_imm_c
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half 3.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}
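; For <2 x half>, SI and VI scalarize: the high halves are extracted with
; v_lshrrev_b32, FMA'd individually, and repacked with v_lshlrev_b32 and
; v_or_b32, while GFX9 selects a single packed v_pk_fma_f16.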
; GCN-LABEL: {{^}}fma_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]

; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]

; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]

; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]

; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
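; In the _imm_ variants below, the splat <2 x half> constant 3.0 is
; materialized once into an SGPR (s_mov_b32 of the f32 value 0x40400000 on
; SI, s_movk_i32 of the f16 value 0x4200 on VI/GFX9) and used as a scalar
; operand of each FMA.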
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SIVI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]

; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]

; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], s[[A_F32]], v[[C_F32_1]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], s[[A_F32]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], s[[A_F16]], v[[C_F16_1]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], s[[A_F16]], v[[C_V2_F16]]

; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]

; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> <half 3.0, half 3.0>, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_b:
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], s[[B_F32]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], s[[B_F32]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]]

; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]

; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> <half 3.0, half 3.0>, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fma_v2f16_imm_c:
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}

; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]

; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]

; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], s[[C_F32]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], s[[C_F32]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], s[[C_F16]]
; GCN-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> <half 3.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
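; <4 x half> is handled as two <2 x half> halves loaded with a single
; buffer_load_dwordx2: SI and VI scalarize all four elements, while GFX9
; emits two v_pk_fma_f16.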
; GCN-LABEL: {{^}}fma_v4f16
; GCN: buffer_load_dwordx2 v{{\[}}[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]{{\]}}
; GCN: buffer_load_dwordx2 v{{\[}}[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]{{\]}}
; GCN: buffer_load_dwordx2 v{{\[}}[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]{{\]}}

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V4_F16_HI]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_2:[0-9]+]], 16, v[[A_V4_F16_HI]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V4_F16_LO]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V4_F16_LO]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_V4_F16_HI]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_V4_F16_HI]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[A_V4_F16_LO]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[A_V4_F16_HI]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_2:[0-9]+]], v[[B_V4_F16_LO]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_3:[0-9]+]], v[[B_V4_F16_HI]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_2:[0-9]+]], v[[C_V4_F16_LO]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_3:[0-9]+]], v[[C_V4_F16_HI]]

; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
; SI-DAG: v_fma_f32 v[[R_F32_2:[0-9]+]], v[[A_F32_2]], v[[B_F32_2]], v[[C_F32_2]]
; SI-DAG: v_fma_f32 v[[R_F32_3:[0-9]+]], v[[A_F32_3]], v[[B_F32_3]], v[[C_F32_3]]

; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_2:[0-9]+]], v[[R_F32_2]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[R_F32_3]]

; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_0:[0-9]]], 16, v[[R_F16_2]]
; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_1:[0-9]]], 16, v[[R_F16_3]]

; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V4_F16_HI]]
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]

; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[A_F16_0]], v[[B_F16_0]], v[[C_F16_0]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]

; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]]
; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]]

; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]

; GCN: buffer_store_dwordx2 v{{\[}}[[R_V4_F16_LO]]:[[R_V4_F16_HI]]{{\]}}
; GCN: s_endpgm

define amdgpu_kernel void @fma_v4f16(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b,
    <4 x half> addrspace(1)* %c) {
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %c.val = load <4 x half>, <4 x half> addrspace(1)* %c
  %r.val = call <4 x half> @llvm.fma.v4f16(<4 x half> %a.val, <4 x half> %b.val, <4 x half> %c.val)
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}