; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s

; Codegen checks for stores to global memory (addrspace(1)).
;
; Check prefixes enabled by the run lines above:
;   FUNC - every run (function labels)
;   GCN  - all three amdgcn runs (verde, tonga, gfx900)
;   SIVI - the verde and tonga runs
;   SI   - the verde run only; used by the unaligned sub-dword vector tests
;   GFX9 - the gfx900 run
;   EG   - the r600 redwood run
;   CM   - the r600 cayman run

; i1 store
; FUNC-LABEL: {{^}}store_i1:
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR

; SIVI: buffer_store_byte
; GFX9: global_store_byte
define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) {
entry:
  store i1 true, i1 addrspace(1)* %out
  ret void
}

; i8 store
; FUNC-LABEL: {{^}}store_i8:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
; EG-NOT: MEM_RAT MSKOR

; EG: VTX_READ_8
; EG: AND_INT
; EG: AND_INT
; EG: LSHL
; EG: LSHL
; EG: LSHL

; SIVI: buffer_store_byte
; GFX9: global_store_byte
define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(1)* %out
  ret void
}

; i16 store
; FUNC-LABEL: {{^}}store_i16:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
; EG-NOT: MEM_RAT MSKOR

; EG: VTX_READ_16
; EG: AND_INT
; EG: AND_INT
; EG: LSHL
; EG: LSHL
; EG: LSHL


; SIVI: buffer_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(1)* %out
  ret void
}

; i24 store: checked as one short store plus one byte store on amdgcn.
; FUNC-LABEL: {{^}}store_i24:
; SIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; SIVI-DAG: buffer_store_byte
; SIVI-DAG: buffer_store_short

; GFX9-DAG: global_store_byte_d16_hi v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off offset:2
; GFX9-DAG: global_store_short

; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) {
entry:
  store i24 %in, i24 addrspace(1)* %out
  ret void
}

; i25 store: checked as a mask to 25 bits followed by a single dword store.
; FUNC-LABEL: {{^}}store_i25:
; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
; GCN: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
; SIVI: buffer_store_dword [[VAND]]
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAND]]

; EG: MEM_RAT_CACHELESS STORE_RAW
; EG-NOT: MEM_RAT

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT
define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) {
entry:
  store i25 %in, i25 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8:
; v2i8 is naturally 2B aligned
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR

; SIVI: buffer_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
  ret void
}

; v2i8 store with align 1
; FUNC-LABEL: {{^}}store_v2i8_unaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR

; SI: buffer_store_byte
define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
  ret void
}


; FUNC-LABEL: {{^}}store_v2i16:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dword
; GFX9: global_store_dword
define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
  ret void
}

; v2i16 store with align 2
; FUNC-LABEL: {{^}}store_v2i16_unaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_short
; SIVI: buffer_store_short

; GFX9: global_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dword
; GFX9: global_store_dword
define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
  ret void
}

; v4i8 store with align 1
; FUNC-LABEL: {{^}}store_v4i8_unaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
  ret void
}

; v4i8 store with align 2
; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT MSKOR
; CM: MEM_RAT MSKOR
; CM-NOT: MEM_RAT MSKOR
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
  ret void
}

; floating-point store
; FUNC-LABEL: {{^}}store_f32:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1

; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}

; SIVI: buffer_store_dword
; GFX9: global_store_dword

define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) {
  store float %in, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i16:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY

; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}

; SIVI: buffer_store_dwordx2
; GFX9: global_store_dwordx2
define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
  ret void
}

; vec2 floating-point stores
; FUNC-LABEL: {{^}}store_v2f32:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx2
; GFX9: global_store_dwordx2

define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(1)* %out
  ret void
}

; v3i32 store: checked as one dwordx2 store plus one dword store on amdgcn.
; FUNC-LABEL: {{^}}store_v3i32:
; SIVI-DAG: buffer_store_dwordx2
; SIVI-DAG: buffer_store_dword v

; GFX9-DAG: global_store_dwordx2
; GFX9-DAG: global_store_dword v

; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}},
define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
  store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
  ret void
}

; FUNC-LABEL: {{^}}store_v4i32:
; EG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XYZW}}
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
  ret void
}

; v4i32 store with align 4
; FUNC-LABEL: {{^}}store_v4i32_unaligned:
; EG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XYZW}}
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; v4f32 store
; FUNC-LABEL: {{^}}store_v4f32:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
  %1 = load <4 x float>, <4 x float> addrspace(1) * %in
  store <4 x float> %1, <4 x float> addrspace(1)* %out
  ret void
}

; i64 truncated to i8, then stored
; FUNC-LABEL: {{^}}store_i64_i8:
; EG: MEM_RAT MSKOR

; CM: MEM_RAT MSKOR

; SIVI: buffer_store_byte
; GFX9: global_store_byte
define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(1)* %out
  ret void
}

; i64 truncated to i16, then stored
; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MEM_RAT MSKOR
; SIVI: buffer_store_short
; GFX9: global_store_short
define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(1)* %out
  ret void
}

; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal and the legalizer
; should not try to split the 64-bit store back into 2 32-bit stores.

; FUNC-LABEL: {{^}}vecload2:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XY, T[0-9]+\.X}}, 1
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SIVI: buffer_store_dwordx2
; GFX9: global_store_dwordx2
define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
entry:
  %0 = load i32, i32 addrspace(4)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(1)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
  ret void
}

; When i128 was a legal type, this program produced "cannot select" errors.
; The four adjacent i32 stores are checked as a single 128-bit store.

; FUNC-LABEL: {{^}}"i128-const-store":
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 1

; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X

; SIVI: buffer_store_dwordx4
; GFX9: global_store_dwordx4
define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) {
entry:
  store i32 1, i32 addrspace(1)* %out, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 1, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 2, i32 addrspace(1)* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 2, i32 addrspace(1)* %arrayidx6, align 4
  ret void
}

attributes #0 = { nounwind }