; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s

; half args should be promoted to float for SI and lower.
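; SI and CI have no native f16 instructions, so f16 values are extended to
; f32 with v_cvt_f32_f16 and truncated back with v_cvt_f16_f32 around each
; operation. VI adds native f16 instructions and SDWA operands, which the
; VI checks below rely on.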

; GCN-LABEL: {{^}}load_f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_ARG]]
define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
  store half %arg, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v2f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_ARG]]
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN-NOT: {{buffer|flat|global}}_load_


; GCN-NOT: {{flat|global}}_load
; GCN-DAG: {{flat|global}}_store_dword
; GCN-DAG: {{flat|global}}_store_short
; GCN-NOT: {{flat|global}}_store
; GCN: s_endpgm
define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}


; FIXME: Why not one load?
; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}}
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]]
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v8f16_arg:
define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_arg:
define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx2 s
; GCN-NOT: _load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: _store_dword
; GCN-DAG: _store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: s_load_dwordx4

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: flat_store_dwordx4
; GCN: flat_store_dwordx4
define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}
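
; There is no direct f16 <-> f64 conversion on GCN; extending half to
; double goes through f32 in two steps, v_cvt_f32_f16 followed by
; v_cvt_f64_f32, as the checks below show.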

; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[ARG_F32:[0-9]+]], [[ARG]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[ARG_F32]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN-DAG: s_load_dword s
; GCN: s_lshr_b32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx2 s
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx2 s

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx4 s

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32

; GCN: s_endpgm
define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: flat_load_ushort [[TMP:v[0-9]+]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[TMP]]
define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: flat_load_dword [[TMP:v[0-9]+]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[TMP]]
define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: flat_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[TMP]]
define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v8f16:
; GCN: flat_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[TMP]]
; GCN: s_endpgm
define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_f16_to_f32:
; GCN: flat_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}
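
; To convert the high half of a dword, SI first shifts it down with
; v_lshrrev_b32, while VI selects it directly with an SDWA source operand
; (src0_sel:WORD_1) on v_cvt_f32_f16.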

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: flat_load_dword [[LOAD:v[0-9]+]],

; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]

; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]

; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: flat_load_dwordx4
; GCN: flat_load_dwordx4

; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; GCN: flat_store_dwordx4

; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; VI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa


; GCN: flat_store_dwordx4
; GCN: flat_store_dwordx4
; GCN: flat_store_dwordx4

; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_f16_to_f64:
; GCN: flat_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: flat_load_dword [[LOAD:v[0-9]+]],

; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]

; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]]
; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]]

; GCN-DAG: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:

; XSI: flat_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XSI: v_cvt_f32_f16_e32
; XSI: v_cvt_f32_f16_e32
; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; XSI: v_cvt_f32_f16_e32
; XSI-NOT: v_cvt_f32_f16

; XVI: flat_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_sdwa
; XVI-NOT: v_cvt_f32_f16

; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
; GCN-NOT: v_cvt_f32_f16

; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32

; GCN-DAG: flat_store_dwordx4
; GCN-DAG: flat_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}
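
; fptrunc f32 -> f16 is a single v_cvt_f16_f32. When two results are packed
; into one dword, SI shifts the high half up with v_lshlrev_b32 and ORs the
; two together; VI instead writes the high word directly with an SDWA
; destination (dst_sel:WORD_1) before the OR.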

; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
; GCN: flat_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: flat_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]

; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]

; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]]

; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[PACKED]]
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: flat_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; SI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_sdwa
; GCN-DAG: v_cvt_f16_f32_e32
; GCN: flat_store_short
; GCN: flat_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: flat_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; SI-DAG: v_cvt_f16_f32_e32
; SI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_sdwa
; VI-DAG: v_cvt_f16_f32_sdwa
; GCN-DAG: v_cvt_f16_f32_e32
; GCN: flat_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: flat_load_dwordx4
; GCN: flat_load_dwordx4
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_sdwa
; VI-DAG: v_cvt_f16_f32_sdwa
; VI-DAG: v_cvt_f16_f32_sdwa
; VI-DAG: v_cvt_f16_f32_sdwa
; GCN: flat_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: flat_load_dwordx4
; GCN: flat_load_dwordx4
; GCN: flat_load_dwordx4
; GCN: flat_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: flat_store_dwordx4
; GCN-DAG: flat_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}
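
; SI has no 16-bit add; fadd on half is promoted, extending the operands
; with v_cvt_f32_f16 and performing the addition with v_add_f32.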

; FIXME: Unsafe math should fold conversions away
; GCN-LABEL: {{^}}fadd_f16:
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fadd_v2f16:
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v4f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}fadd_v8f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}
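
; Bitcast between half and i16 is a no-op; the value loaded with
; flat_load_ushort should reach the store untouched, which the GCN-NOT
; line below verifies.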

; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: flat_load_ushort [[TMP:v[0-9]+]]
; GCN-NOT: [[TMP]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[TMP]]
define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: flat_load_ushort [[TMP:v[0-9]+]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[TMP]]
define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }