; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Code generation tests for the half type and vectors of half on amdgcn:
; kernel arguments, global loads/stores, fpext to float/double, fptrunc from
; float, fadd/fsub and bitcasts to/from i16. Functions that carry only a
; label line merely verify that the input compiles and the symbol is emitted.

; half args should be promoted to float

; ---------------------------------------------------------------------------
; half / half-vector kernel arguments stored straight to global memory.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}load_f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
; GCN: buffer_store_short [[CVT]]
define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
  store half %arg, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v2f16_arg:
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
; GCN-NOT: buffer_store
; GCN: s_endpgm
define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v8f16_arg:
define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; half / half-vector kernel arguments extended to float before the store.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}extload_v2f16_arg:
define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dwordx2
; GCN: s_endpgm
define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; half / half-vector kernel arguments extended to double before the store.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN: s_endpgm
define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; Plain global load followed by a global store of the same half value.
; Each test captures the loaded register(s) and requires the store to use them.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: buffer_load_dword [[TMP:v[0-9]+]]
; GCN: buffer_store_dword [[TMP]]
define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[TMP]]
define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

; The store line uses the captured tuple rather than defining the variable
; again; a second definition would accept any register tuple and would not tie
; the stored value to the loaded one.
; GCN-LABEL: {{^}}global_load_store_v8f16:
; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[TMP]]
; GCN: s_endpgm
define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; Global load of half / half vector, fpext to float, global store.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}global_extload_f16_to_f32:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_dword [[CVT]]
define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

; GCN: s_endpgm
define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; Global load of half / half vector, fpext to double, global store.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}global_extload_f16_to_f64:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: buffer_store_dwordx2 [[CVT1]]
define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:

; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32

; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; Global load of float / float vector, fptrunc to half, global store.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_short [[CVT]]
define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN-NOT: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: buffer_store_dwordx4
; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; Arithmetic on half values. The add tests only pin instructions for the
; default-target run line; the tonga run line just has to reach s_endpgm.
; ---------------------------------------------------------------------------

; FIXME: Unsafe math should fold conversions away
; GCN-LABEL: {{^}}fadd_f16:
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fadd_v2f16:
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}fadd_v4f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}fadd_v8f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}fsub_f16:
; GCN: v_subrev_f32_e32
; GCN: s_endpgm
define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1
  %a = load half, half addrspace(1)* %in
  %b = load half, half addrspace(1)* %b_ptr
  %sub = fsub half %a, %b
  store half %sub, half addrspace(1)* %out
  ret void
}

; ---------------------------------------------------------------------------
; Bitcasts between half and i16: the loaded 16-bit register is stored as is.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }