1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s 2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s 3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s 5 6; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: 7; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8; GFX900-NEXT: ds_read_u16 v2, v0 9; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 10; GFX900-DAG: s_waitcnt lgkmcnt(0) 11; GFX900-DAG: v_mov_b32_e32 v1, v2 12; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16 13; GFX900: ds_write_b16 [[ZERO]], v2 14; GFX900-NEXT: s_waitcnt lgkmcnt(1) 15; GFX900-NEXT: v_mov_b32_e32 v0, v1 16; GFX900-NEXT: s_waitcnt lgkmcnt(0) 17; GFX900-NEXT: s_setpc_b64 s[30:31] 18define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 { 19entry: 20 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 21 %load.lo = load i16, i16 addrspace(3)* %in 22 %load.hi = load i16, i16 addrspace(3)* %gep 23 store i16 %load.lo, i16 addrspace(3)* null 24 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 25 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 26 ret <2 x i16> %build1 27} 28 29; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi: 30; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0 32; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16 33; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 34; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]] 35; GFX900-DAG: s_waitcnt lgkmcnt(0) 
36; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]] 37; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]] 38; GFX900-NEXT: s_waitcnt lgkmcnt(0) 39; GFX900-NEXT: s_setpc_b64 s[30:31] 40define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 { 41entry: 42 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 43 %load.lo = load i16, i16 addrspace(3)* %in 44 %load.hi = load i16, i16 addrspace(3)* %gep 45 store i16 %load.hi, i16 addrspace(3)* null 46 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 47 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 48 ret <2 x i16> %build1 49} 50 51; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi: 52; GFX900: ds_read_u16 v3, v0 53; GFX900-NEXT: ds_read_u16 v0, v0 offset:16 54; GFX900-NEXT: s_waitcnt lgkmcnt(1) 55; GFX900-NEXT: ds_write_b16 v1, v3 56; GFX900-NEXT: s_waitcnt lgkmcnt(1) 57; GFX900-NEXT: ds_write_b16 v2, v0 58; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3 59; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 60; GFX900-NEXT: s_waitcnt lgkmcnt(0) 61; GFX900-NEXT: s_setpc_b64 s[30:31] 62define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 { 63entry: 64 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8 65 %load.lo = load i16, i16 addrspace(3)* %in 66 %load.hi = load i16, i16 addrspace(3)* %gep 67 store i16 %load.lo, i16 addrspace(3)* %out0 68 store i16 %load.hi, i16 addrspace(3)* %out1 69 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0 70 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1 71 ret <2 x i16> %build1 72} 73 74; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo: 75; GCN: s_waitcnt 76; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 77; GFX900-NEXT: s_waitcnt 78; GFX900-NEXT: s_setpc_b64 79 80; NO-D16-HI: ds_read_u16 v 81define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 { 82entry: 83 %load = load i16, i16 
addrspace(3)* %in 84 %build = insertelement <2 x i16> undef, i16 %load, i32 1 85 ret <2 x i16> %build 86} 87 88; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo: 89; GCN: s_waitcnt 90; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 91; GFX900-NEXT: s_waitcnt 92; GFX900-NEXT: v_mov_b32_e32 v0, v1 93; GFX900-NEXT: s_setpc_b64 94 95; NO-D16-HI: ds_read_u16 v 96define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 { 97entry: 98 %load = load i16, i16 addrspace(3)* %in 99 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 100 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 101 ret <2 x i16> %build1 102} 103 104; Show that we get reasonable regalloc without physreg constraints. 105; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg: 106; GCN: s_waitcnt 107; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 108; GFX900-NEXT: s_waitcnt 109; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} 110; GFX900-NEXT: s_waitcnt 111; GFX900-NEXT: s_setpc_b64 112 113; NO-D16-HI: ds_read_u16 v 114define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 { 115entry: 116 %load = load i16, i16 addrspace(3)* %in 117 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 118 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 119 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 120 ret void 121} 122 123; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo: 124; GCN: s_waitcnt 125; GFX900-NEXT: v_mov_b32_e32 v1, 0 126; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 127; GFX900-NEXT: s_waitcnt 128; GFX900-NEXT: v_mov_b32_e32 v0, v1 129; GFX900-NEXT: s_setpc_b64 130 131; NO-D16-HI: ds_read_u16 v 132define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 { 133entry: 134 %load = load i16, i16 addrspace(3)* %in 135 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 136 ret <2 x i16> %build 137} 138 139; FIXME: Remove m0 initialization 140; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift: 141; GCN: s_waitcnt 142; GFX900-NEXT: 
ds_read_u16 v0, v0 143; GFX900-NEXT: s_waitcnt lgkmcnt(0) 144; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 145; GFX900-NEXT: s_setpc_b64 146 147; NO-D16-HI: ds_read_u16 v 148; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0 149define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 { 150entry: 151 %load = load i16, i16 addrspace(3)* %in 152 %zext = zext i16 %load to i32 153 %shift = shl i32 %zext, 16 154 ret i32 %shift 155} 156 157; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg: 158; GCN: s_waitcnt 159; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 160; GFX900-NEXT: s_waitcnt 161; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} 162; GFX900-NEXT: s_waitcnt 163; GFX900-NEXT: s_setpc_b64 164 165; NO-D16-HI: ds_read_u16 v 166define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 { 167entry: 168 %load = load half, half addrspace(3)* %in 169 %build0 = insertelement <2 x half> undef, half %reg, i32 0 170 %build1 = insertelement <2 x half> %build0, half %load, i32 1 171 store <2 x half> %build1, <2 x half> addrspace(1)* undef 172 ret void 173} 174 175; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8: 176; GCN: s_waitcnt 177; GFX900-NEXT: ds_read_u8_d16_hi v1, v0 178; GFX900-NEXT: s_waitcnt 179; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} 180; GFX900-NEXT: s_waitcnt 181; GFX900-NEXT: s_setpc_b64 182 183; NO-D16-HI: ds_read_u8 v 184define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 { 185entry: 186 %load = load i8, i8 addrspace(3)* %in 187 %ext = zext i8 %load to i16 188 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 189 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 190 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 191 ret void 192} 193 194; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8: 195; GCN: s_waitcnt 196; GFX900-NEXT: ds_read_i8_d16_hi v1, v0 197; GFX900-NEXT: s_waitcnt 198; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} 199; GFX900-NEXT: 
s_waitcnt 200; GFX900-NEXT: s_setpc_b64 201 202; NO-D16-HI: ds_read_i8 v 203define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 { 204entry: 205 %load = load i8, i8 addrspace(3)* %in 206 %ext = sext i8 %load to i16 207 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 208 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 209 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 210 ret void 211} 212 213; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8: 214; GCN: s_waitcnt 215; GFX900-NEXT: ds_read_u8_d16_hi v1, v0 216; GFX900-NEXT: s_waitcnt 217; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} 218; GFX900-NEXT: s_waitcnt 219; GFX900-NEXT: s_setpc_b64 220 221; NO-D16-HI: ds_read_u8 v 222define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 { 223entry: 224 %load = load i8, i8 addrspace(3)* %in 225 %ext = zext i8 %load to i16 226 %bitcast = bitcast i16 %ext to half 227 228 %build0 = insertelement <2 x half> undef, half %reg, i32 0 229 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 230 store <2 x half> %build1, <2 x half> addrspace(1)* undef 231 ret void 232} 233 234; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8: 235; GCN: s_waitcnt 236; GFX900-NEXT: ds_read_i8_d16_hi v1, v0 237; GFX900-NEXT: s_waitcnt 238; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} 239; GFX900-NEXT: s_waitcnt 240; GFX900-NEXT: s_setpc_b64 241 242; NO-D16-HI: ds_read_i8 v 243define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 { 244entry: 245 %load = load i8, i8 addrspace(3)* %in 246 %ext = sext i8 %load to i16 247 %bitcast = bitcast i16 %ext to half 248 249 %build0 = insertelement <2 x half> undef, half %reg, i32 0 250 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 251 store <2 x half> %build1, <2 x half> addrspace(1)* undef 252 ret void 253} 254 255; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg: 256; GCN: s_waitcnt 
257; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 258; GFX900-NEXT: s_waitcnt 259; GFX900-NEXT: global_store_dword 260; GFX900-NEXT: s_waitcnt 261; GFX900-NEXT: s_setpc_b64 262define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 { 263entry: 264 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047 265 %load = load i16, i16 addrspace(1)* %gep 266 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 267 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 268 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 269 ret void 270} 271 272; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg: 273; GCN: s_waitcnt 274; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 275; GFX900-NEXT: s_waitcnt 276; GFX900-NEXT: global_store_dword 277; GFX900-NEXT: s_waitcnt 278; GFX900-NEXT: s_setpc_b64 279define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 { 280entry: 281 %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047 282 %load = load half, half addrspace(1)* %gep 283 %build0 = insertelement <2 x half> undef, half %reg, i32 0 284 %build1 = insertelement <2 x half> %build0, half %load, i32 1 285 store <2 x half> %build1, <2 x half> addrspace(1)* undef 286 ret void 287} 288 289; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8: 290; GCN: s_waitcnt 291; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 292; GFX900-NEXT: s_waitcnt 293; GFX900-NEXT: global_store_dword 294; GFX900-NEXT: s_waitcnt 295; GFX900-NEXT: s_setpc_b64 296define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 { 297entry: 298 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 299 %load = load i8, i8 addrspace(1)* %gep 300 %ext = zext i8 %load to i16 301 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 302 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 303 store <2 x i16> %build1, <2 x i16> 
addrspace(1)* undef 304 ret void 305} 306 307; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8: 308; GCN: s_waitcnt 309; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 310; GFX900-NEXT: s_waitcnt 311; GFX900-NEXT: global_store_dword 312; GFX900-NEXT: s_waitcnt 313; GFX900-NEXT: s_setpc_b64 314define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 { 315entry: 316 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 317 %load = load i8, i8 addrspace(1)* %gep 318 %ext = sext i8 %load to i16 319 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 320 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 321 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 322 ret void 323} 324 325; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8: 326; GCN: s_waitcnt 327; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 328; GFX900-NEXT: s_waitcnt 329; GFX900-NEXT: global_store_dword 330; GFX900-NEXT: s_waitcnt 331; GFX900-NEXT: s_setpc_b64 332define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 { 333entry: 334 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 335 %load = load i8, i8 addrspace(1)* %gep 336 %ext = sext i8 %load to i16 337 %bitcast = bitcast i16 %ext to half 338 %build0 = insertelement <2 x half> undef, half %reg, i32 0 339 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 340 store <2 x half> %build1, <2 x half> addrspace(1)* undef 341 ret void 342} 343 344; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8: 345; GCN: s_waitcnt 346; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 347; GFX900-NEXT: s_waitcnt 348; GFX900-NEXT: global_store_dword 349; GFX900-NEXT: s_waitcnt 350; GFX900-NEXT: s_setpc_b64 351define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 { 352entry: 353 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 
354 %load = load i8, i8 addrspace(1)* %gep 355 %ext = zext i8 %load to i16 356 %bitcast = bitcast i16 %ext to half 357 %build0 = insertelement <2 x half> undef, half %reg, i32 0 358 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 359 store <2 x half> %build1, <2 x half> addrspace(1)* undef 360 ret void 361} 362 363; GCN-LABEL: load_flat_hi_v2i16_reglo_vreg: 364; GCN: s_waitcnt 365; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] 366; GFX900-NEXT: s_waitcnt 367; GFX900-NEXT: global_store_dword v[0:1], v2 368; GFX900-NEXT: s_waitcnt 369; GFX900-NEXT: s_setpc_b64 370 371; NO-D16-HI: flat_load_ushort v{{[0-9]+}} 372; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, 373; GFX803: v_or_b32_sdwa 374; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 375define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 { 376entry: 377 %load = load i16, i16* %in 378 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 379 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 380 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 381 ret void 382} 383 384; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg: 385; GCN: s_waitcnt 386; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] 387; GFX900-NEXT: s_waitcnt 388; GFX900-NEXT: global_store_dword v[0:1], v2 389; GFX900-NEXT: s_waitcnt 390; GFX900-NEXT: s_setpc_b64 391 392; NO-D16-HI: flat_load_ushort v{{[0-9]+}} 393; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, 394; GFX803: v_or_b32_sdwa 395; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 396define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 { 397entry: 398 %load = load half, half* %in 399 %build0 = insertelement <2 x half> undef, half %reg, i32 0 400 %build1 = insertelement <2 x half> %build0, half %load, i32 1 401 store <2 x half> %build1, <2 x half> addrspace(1)* undef 402 ret void 403} 404 405; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8: 406; GCN: s_waitcnt 407; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] 408; GFX900-NEXT: 
s_waitcnt 409; GFX900-NEXT: global_store_dword v[0:1], v2 410; GFX900-NEXT: s_waitcnt 411; GFX900-NEXT: s_setpc_b64 412 413; NO-D16-HI: flat_load_ubyte v{{[0-9]+}} 414; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, 415; GFX803: v_or_b32_sdwa 416; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 417define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 { 418entry: 419 %load = load i8, i8* %in 420 %ext = zext i8 %load to i16 421 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 422 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 423 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 424 ret void 425} 426 427; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8: 428; GCN: s_waitcnt 429; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] 430; GFX900-NEXT: s_waitcnt 431; GFX900-NEXT: global_store_dword v[0:1], v2 432; GFX900-NEXT: s_waitcnt 433; GFX900-NEXT: s_setpc_b64 434 435; NO-D16-HI: flat_load_sbyte v{{[0-9]+}} 436; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, 437; GFX803: v_or_b32_sdwa 438; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 439define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 { 440entry: 441 %load = load i8, i8* %in 442 %ext = sext i8 %load to i16 443 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 444 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 445 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 446 ret void 447} 448 449; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8: 450; GCN: s_waitcnt 451; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] 452; GFX900-NEXT: s_waitcnt 453; GFX900-NEXT: global_store_dword v[0:1], v2 454; GFX900-NEXT: s_waitcnt 455; GFX900-NEXT: s_setpc_b64 456 457; NO-D16-HI: flat_load_ubyte v{{[0-9]+}} 458; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, 459; GFX803: v_or_b32_sdwa 460; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 461define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 { 462entry: 463 %load = load 
i8, i8* %in 464 %ext = zext i8 %load to i16 465 %bitcast = bitcast i16 %ext to half 466 %build0 = insertelement <2 x half> undef, half %reg, i32 0 467 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 468 store <2 x half> %build1, <2 x half> addrspace(1)* undef 469 ret void 470} 471 472; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8: 473; GCN: s_waitcnt 474; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] 475; GFX900-NEXT: s_waitcnt 476; GFX900-NEXT: global_store_dword v[0:1], v2 477; GFX900-NEXT: s_waitcnt 478; GFX900-NEXT: s_setpc_b64 479 480; NO-D16-HI: flat_load_sbyte v{{[0-9]+}} 481; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, 482; GFX803: v_or_b32_sdwa 483; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 484define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 { 485entry: 486 %load = load i8, i8* %in 487 %ext = sext i8 %load to i16 488 %bitcast = bitcast i16 %ext to half 489 %build0 = insertelement <2 x half> undef, half %reg, i32 0 490 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 491 store <2 x half> %build1, <2 x half> addrspace(1)* undef 492 ret void 493} 494 495; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: 496; GCN: s_waitcnt 497; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} 498; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} 499; GFX900-NEXT: s_waitcnt 500; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 501; GFX900-NEXT: s_waitcnt 502; GFX900-NEXT: s_setpc_b64 503 504; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} 505define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 { 506entry: 507 %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 508 %load = load i16, i16 addrspace(5)* %gep 509 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 510 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 511 store <2 x i16> %build1, <2 
x i16> addrspace(1)* undef 512 ret void 513} 514 515; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: 516; GCN: s_waitcnt 517; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} 518; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} 519; GFX900-NEXT: s_waitcnt 520; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 521; GFX900-NEXT: s_waitcnt 522; GFX900-NEXT: s_setpc_b64 523 524; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} 525define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, half %reg) #0 { 526entry: 527 %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047 528 %load = load half, half addrspace(5)* %gep 529 %build0 = insertelement <2 x half> undef, half %reg, i32 0 530 %build1 = insertelement <2 x half> %build0, half %load, i32 1 531 store <2 x half> %build1, <2 x half> addrspace(1)* undef 532 ret void 533} 534 535; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: 536; GCN: s_waitcnt 537; GFX900-MUBUFF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} 538; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe 539; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}} 540; GFX900: s_waitcnt 541; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 542; GFX900-NEXT: s_waitcnt 543; GFX900-NEXT: s_setpc_b64 544 545; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} 546define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 { 547entry: 548 %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) 549 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 550 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 551 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 552 ret void 553} 554 555; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: 556; GCN: s_waitcnt 557; GFX900-MUBUF-NEXT: 
buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} 558; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe 559; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}} 560; GFX900-NEXT: s_waitcnt 561; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 562; GFX900-NEXT: s_waitcnt 563; GFX900-NEXT: s_setpc_b64 564 565; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} 566define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 { 567entry: 568 %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) 569 %build0 = insertelement <2 x half> undef, half %reg, i32 0 570 %build1 = insertelement <2 x half> %build0, half %load, i32 1 571 store <2 x half> %build1, <2 x half> addrspace(1)* undef 572 ret void 573} 574 575; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: 576; GCN: s_waitcnt 577; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} 578; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}} 579; GFX900-NEXT: s_waitcnt 580; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 581; GFX900-NEXT: s_waitcnt 582; GFX900-NEXT: s_setpc_b64 583 584; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} 585define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 { 586entry: 587 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 588 %load = load i8, i8 addrspace(5)* %gep 589 %ext = zext i8 %load to i16 590 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 591 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 592 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 593 ret void 594} 595 596; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: 597; GCN: s_waitcnt 598; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} 599; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, 
s32 offset:4095{{$}} 600; GFX900-NEXT: s_waitcnt 601; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 602; GFX900-NEXT: s_waitcnt 603; GFX900-NEXT: s_setpc_b64 604 605; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} 606define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 { 607entry: 608 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 609 %load = load i8, i8 addrspace(5)* %gep 610 %ext = zext i8 %load to i16 611 %bitcast = bitcast i16 %ext to half 612 %build0 = insertelement <2 x half> undef, half %reg, i32 0 613 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 614 store <2 x half> %build1, <2 x half> addrspace(1)* undef 615 ret void 616} 617 618; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: 619; GCN: s_waitcnt 620; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} 621; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} 622; GFX900-NEXT: s_waitcnt 623; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 624; GFX900-NEXT: s_waitcnt 625; GFX900-NEXT: s_setpc_b64 626 627; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} 628define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 { 629entry: 630 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 631 %load = load i8, i8 addrspace(5)* %gep 632 %ext = sext i8 %load to i16 633 %bitcast = bitcast i16 %ext to half 634 %build0 = insertelement <2 x half> undef, half %reg, i32 0 635 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1 636 store <2 x half> %build1, <2 x half> addrspace(1)* undef 637 ret void 638} 639 640; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: 641; GCN: s_waitcnt 642; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} 643; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} 
644; GFX900-NEXT: s_waitcnt 645; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 646; GFX900-NEXT: s_waitcnt 647; GFX900-NEXT: s_setpc_b64 648 649; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} 650define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 { 651entry: 652 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 653 %load = load i8, i8 addrspace(5)* %gep 654 %ext = sext i8 %load to i16 655 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 656 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 657 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 658 ret void 659} 660 661; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: 662; GCN: s_waitcnt 663; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} 664; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe 665; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} 666; GFX900-NEXT: s_waitcnt 667; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 668; GFX900-NEXT: s_waitcnt 669; GFX900-NEXT: s_setpc_b64 670 671; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} 672define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 { 673entry: 674 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) 675 %ext = zext i8 %load to i16 676 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 677 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 678 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 679 ret void 680} 681 682; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: 683; GCN: s_waitcnt 684; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} 685; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe 686; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}} 687; GFX900-NEXT: s_waitcnt 
688; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 689; GFX900-NEXT: s_waitcnt 690; GFX900-NEXT: s_setpc_b64 691 692; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}} 693define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 { 694entry: 695 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) 696 %ext = sext i8 %load to i16 697 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 698 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 699 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef 700 ret void 701} 702 703; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: 704; GCN: s_waitcnt 705; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} 706; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe 707; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} 708; GFX900-NEXT: s_waitcnt 709; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 710; GFX900-NEXT: s_waitcnt 711; GFX900-NEXT: s_setpc_b64 712 713; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} 714define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 { 715entry: 716 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) 717 %ext = zext i8 %load to i16 718 %bc.ext = bitcast i16 %ext to half 719 %build0 = insertelement <2 x half> undef, half %reg, i32 0 720 %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1 721 store <2 x half> %build1, <2 x half> addrspace(1)* undef 722 ret void 723} 724 725; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg: 726; GCN: s_waitcnt 727; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 728; GFX900-NEXT: s_waitcnt 729; GFX900-NEXT: global_store_dword 730; GFX900-NEXT: s_waitcnt 731; GFX900-NEXT: s_setpc_b64 732 733; GFX803: flat_load_ushort 734; GFX906: global_load_ushort 735define void 
@load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Half load from the constant address space at element index -2047 (byte
; offset -4094), inserted as the high element of a <2 x half> whose low
; element is %reg.
; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Sign-extended i8 load from the constant address space at byte offset -4095,
; bitcast to half and inserted as the high element of a <2 x half>.
; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Zero-extended i8 load from the constant address space at byte offset -4095,
; bitcast to half and inserted as the high element of a <2 x half>.
; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Local object gives known offset, so requires converting from offen
; to offset variant.

; The load reads element 2027 of the i16 array %obj1. The expected immediate
; offset 4094 equals 40 (the size of %obj0) + 2027 * 2, which presumably
; reflects %obj0 being laid out first in the frame.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Byte-sized, sign-extending variant: element 4055 of the i8 array %obj1 is
; expected at immediate offset 4095 (40 + 4055).
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Byte-sized, zero-extending variant: element 4055 of the i8 array %obj1 is
; expected at immediate offset 4095 (40 + 4055).
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Two volatile loads of adjacent i16 elements in local memory (addrspace(3))
; are combined into a <2 x i16>; on gfx900 the second load is expected to use
; the d16_hi form on the register written by the first.
; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
  %load0 = load volatile i16, i16 addrspace(3)* %in
  %load1 = load volatile i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; Non-volatile local loads at %in and at %in + 8 elements (byte offset 16)
; supply the low and high elements respectively.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
; GFX900: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; A single loaded i16 fills both elements of the result, so only one ds_read
; is expected; on gfx900 the value is packed with v_and + v_lshl_or.
; FIXME: Remove and
; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
; GCN-NOT: ds_read
; GFX900: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
; GFX900: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
entry:
  %load0 = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16>
undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
  ret <2 x i16> %build1
}

; A store through %may.alias sits between the low and high local loads. The
; checks expect the read / write / read order to be kept, with the second read
; still using the d16_hi form on gfx900.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
; GFX900: ds_write_b16
; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_write_b16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  store i16 123, i16 addrspace(3)* %may.alias
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; Two volatile loads of adjacent i16 elements from global memory
; (addrspace(1)) are combined into a <2 x i16>.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_global_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
  %load0 = load volatile i16, i16 addrspace(1)* %in
  %load1 = load volatile i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; Same pattern as the global split test, but through a generic (flat) pointer.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_flat_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16* %in, i64 1
  %load0 = load volatile i16, i16* %in
  %load1 = load volatile i16, i16* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; Same pattern as the global split test, but from the constant address space
; (addrspace(4)); the expected instructions are still global loads.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_constant_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
  %load0 = load volatile i16, i16 addrspace(4)* %in
  %load1 = load volatile i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; Two volatile loads of adjacent i16 elements from a byval private
; (addrspace(5)) argument are combined into a <2 x i16>; on gfx900 the second
; load is expected to be the d16_hi form writing the same v0.
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}}
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval(i16) %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
  %load0 = load volatile i16, i16 addrspace(5)* %in
  %load1 = load volatile i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; %reg is both the low element of the returned vector and the value stored
; (volatile) back to %in after the high element has been loaded from %in.
; FIXME: This test should work without copying of v0.
;        ds_read_u16_d16_hi preserves low 16 bits of the destination
;        and ds_write_b16 only reads low 16 bits.
; GCN-LABEL: {{^}}load_local_hi_v2i16_store_local_lo:
; GCN: s_waitcnt
; GFX900: v_mov_b32_e32 [[COPY:v[0-9]+]], v0
; GFX900-NEXT: ds_read_u16_d16_hi [[COPY]], v1
; GFX900-NEXT: ds_write_b16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, [[COPY]]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store volatile i16 %reg, i16 addrspace(3)* %in
  ret <2 x i16> %build1
}

attributes #0 = { nounwind }