; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Testing for ds_read/write_128
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=SI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s

; Notes on the check prefixes used in this file:
;  - The gfx900 and gfx908 invocations with ds128 disabled enable the GFX9
;    prefix, so the negative m0 / accvgpr checks written under that prefix are
;    actually evaluated for those targets.
;  - The default-target and tonga invocations with ds128 disabled share the
;    SICIVI prefix, which expects m0 to be set to -1 in each kernel.
;  - With +enable-ds128, tahiti uses the SI prefix and tonga/gfx900 the CIVI one.

; Scalar i32 load from addrspace(3), stored back to addrspace(3).
; FUNC-LABEL: {{^}}local_load_i32:
; GCN-NOT: s_wqm_b64
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: ds_read_b32

; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
entry:
  %ld = load i32, i32 addrspace(3)* %in
  store i32 %ld, i32 addrspace(3)* %out
  ret void
}

; <2 x i32> load; a single 64-bit DS read is expected on the GCN invocations.
; FUNC-LABEL: {{^}}local_load_v2i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN: ds_read_b64
define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
  ret void
}

; <3 x i32> load; with +enable-ds128 tahiti is expected to split it into a
; b64 and a b32 read, while tonga/gfx900 are expected to use one b96 read.
; FUNC-LABEL: {{^}}local_load_v3i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; SI-DAG: ds_read_b64
; SI-DAG: ds_read_b32
; CIVI-DAG: ds_read_b96
define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
  store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
  ret void
}

; <4 x i32> load; with ds128 disabled one ds_read2_b64 is expected.
; FUNC-LABEL: {{^}}local_load_v4i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}

define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
  ret void
}

; <8 x i32> load; two ds_read2_b64 instructions are expected, in any order.
; FUNC-LABEL: {{^}}local_load_v8i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
  ret void
}

; <16 x i32> load and store; four ds_read2_b64 and four ds_write2_b64
; instructions are expected, in any order.
; FUNC-LABEL: {{^}}local_load_v16i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
  ret void
}

; The extending-load kernels below only check how m0 is handled; no specific
; load or store instruction is matched for them.

; i32 load zero-extended to i64.
; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
  %ld = load i32, i32 addrspace(3)* %in
  %ext = zext i32 %ld to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; i32 load sign-extended to i64.
; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
  %ld = load i32, i32 addrspace(3)* %in
  %ext = sext i32 %ld to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; <1 x i32> load zero-extended to <1 x i64>.
; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
  %ext = zext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; <1 x i32> load sign-extended to <1 x i64>.
; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
  %ext = sext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; <2 x i32> load zero-extended to <2 x i64>.
; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  %ext = zext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; <2 x i32> load sign-extended to <2 x i64>.
; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  %ext = sext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; <4 x i32> load zero-extended to <4 x i64>.
; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  %ext = zext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; <4 x i32> load sign-extended to <4 x i64>.
; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  %ext = sext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
; FUNC-LABEL: {{^}}local_v4i32_to_128:
; The load and store below carry an explicit align 16. Under the SI prefix the
; b128 DS forms must not appear; under the CIVI prefix both are required.

; SI-NOT: ds_read_b128
; SI-NOT: ds_write_b128

; CIVI: ds_read_b128
; CIVI: ds_write_b128

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_v4i32_to_128(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out, align 16
  ret void
}

; The remaining extending-load kernels only check how m0 is handled; no
; specific load or store instruction is matched for them.

; <8 x i32> load zero-extended to <8 x i64>.
; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  %ext = zext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; <8 x i32> load sign-extended to <8 x i64>.
; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  %ext = sext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; <16 x i32> load sign-extended to <16 x i64>.
; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  %ext = sext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; <16 x i32> load zero-extended to <16 x i64>.
; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  %ext = zext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; <32 x i32> load sign-extended to <32 x i64>.
; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
  %ext = sext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}

; <32 x i32> load zero-extended to <32 x i64>.
; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
  %ext = zext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}

; <32 x i32> load and store; besides the m0 handling, the output under the
; GFX9 prefix must not mention accvgpr.
; FUNC-LABEL: {{^}}local_load_v32i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GFX9-NOT: accvgpr

define amdgpu_kernel void @local_load_v32i32(<32 x i32> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
  store <32 x i32> %ld, <32 x i32> addrspace(3)* %out
  ret void
}

attributes #0 = { nounwind }