; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-HSA,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s

; FIXME: r600 is broken because the bigger testcases spill and it's not implemented

; FUNC-LABEL: {{^}}global_load_i16:
; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
; GCN-HSA: flat_load_ushort

; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
  %ld = load i16, i16 addrspace(1)* %in
  store i16 %ld, i16 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_load_v2i16:
; GCN-NOHSA: buffer_load_dword v
; GCN-HSA: flat_load_dword v

; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
  %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
  store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_load_v3i16:
; GCN-NOHSA: buffer_load_dwordx2 v
; GCN-HSA: flat_load_dwordx2 v

; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
  store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_load_v4i16:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2

; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
entry:
  %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
  store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_load_v8i16:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
entry:
  %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
  store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_load_v16i16:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_v16i16_align2:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
define amdgpu_kernel void @global_load_v16i16_align2(<16 x i16> addrspace(1)* %in, <16 x i16> addrspace(1)* %out) #0 {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in, align 2
  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out, align 32
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_i16_to_i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_ushort
; GCN-HSA: flat_store_dword

; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %a = load i16, i16 addrspace(1)* %in
  %ext = zext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_i16_to_i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_sshort
; GCN-HSA: flat_store_dword

; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EGCM: 16
define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %a = load i16, i16 addrspace(1)* %in
  %ext = sext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-HSA: flat_load_ushort

; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %ext = zext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-HSA: flat_load_sshort

; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EGCM: 16
define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %ext = sext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i32:
; GCN-NOHSA: buffer_load_dword
; GCN-HSA: flat_load_dword

; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
; EGCM: 16
define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %ext = zext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i32:
; GCN-NOHSA: buffer_load_dword

; GCN-HSA: flat_load_dword

; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}}
; EGCM: VTX_READ_32 [[DST:T[0-9].[XYZW]]], [[DST]], 0, #1
; TODO: This should use ASHR instead of LSHR + BFE
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal
; EGCM-DAG: 16
; EGCM-DAG: 16
define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %ext = sext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v3i16_to_v3i32:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2

; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; TODO: This should use DST, but for some there are redundant MOVs
; EGCM: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EGCM: 16
define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
  %ext = zext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v3i16_to_v3i32:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2

; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 4, #1
; TODO: This should use DST, but for some there are redundant MOVs
; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal
; EGCM-DAG: 16
; EGCM-DAG: 16
define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
  %ext = sext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i32:
; GCN-NOHSA: buffer_load_dwordx2

; GCN-HSA: flat_load_dwordx2

; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: This should use DST, but for some there are redundant MOVs
; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*}}, literal
; EGCM-DAG: 16
; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*}}, literal
; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal
; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal
; EGCM-DAG: 16
define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %ext = zext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i32:
; GCN-NOHSA: buffer_load_dwordx2

; GCN-HSA: flat_load_dwordx2

; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: We should use ASHR instead of LSHR + BFE
; TODO: This should use DST, but for some there are redundant MOVs
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %ext = sext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}}
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EGCM: CF_END
; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: These should use LSHR instead of BFE_UINT
; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal
; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal
; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal
; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal
; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal
; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal
; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal
; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal
; EGCM-DAG: 65535
; EGCM-DAG: 65535
; EGCM-DAG: 65535
; EGCM-DAG: 65535
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %ext = zext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}}
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EGCM: CF_END
; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: These should use ASHR instead of LSHR + BFE_INT
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %ext = sext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i32:

; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %ext = zext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %ext = sext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i32:

; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
  %ext = sext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_i16_to_i64:
; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM: MOV {{.*}}, 0.0
define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %a = load i16, i16 addrspace(1)* %in
  %ext = zext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_i16_to_i64:
; FIXME: Need to optimize this sequence to avoid extra bfe:
;  t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
;  t31: i64 = any_extend t28
;  t33: i64 = sign_extend_inreg t31, ValueType:ch:i16

; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EGCM: 31
define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %a = load i16, i16 addrspace(1)* %in
  %ext = sext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i64:

; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM: MOV {{.*}}, 0.0
define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %ext = zext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i64:

; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EGCM: 31
define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %ext = sext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64:
define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %ext = zext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64:

; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %ext = sext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64:

; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %ext = zext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64:

; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %ext = sext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64:

; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64:

; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i64:

; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %ext = zext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i64:

; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %ext = sext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i64:

; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i64:

; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %ext = sext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; ; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64:
; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
;   %ext = zext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

; ; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64:
; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
;   %ext = sext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

attributes #0 = { nounwind }