; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s

; Store lowering checks for scalar and vector types. The verde and tonga runs
; share the SI prefix, redwood uses the EG prefix, cayman uses the CM prefix,
; and every run also enables the FUNC prefix used for the function labels.

;===------------------------------------------------------------------------===;
; Global Address Space
;===------------------------------------------------------------------------===;

; Store of the constant i1 true through an addrspace(1) pointer.
; FUNC-LABEL: {{^}}store_i1:
; EG: MEM_RAT MSKOR
; SI: buffer_store_byte
define void @store_i1(i1 addrspace(1)* %out) {
entry:
  store i1 true, i1 addrspace(1)* %out
  ret void
}

; i8 store
; FUNC-LABEL: {{^}}store_i8:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)


; IG 1: Truncate the calculated shift amount for the mask

; IG 2: Shift the value and the mask
; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-NEXT: 255
; IG 3: Initialize the Y and Z channels to zero
; XXX: An optimal scheduler should merge this into one of the previous IGs.
; EG: MOV T[[RW_GPR]].Y, 0.0
; EG: MOV * T[[RW_GPR]].Z, 0.0

; SI: buffer_store_byte

define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(1)* %out
  ret void
}

; i16 store
; FUNC-LABEL: {{^}}store_i16:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X

; IG 0: Get the byte index and truncate the value


; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 3(4.203895e-45),

; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y

; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; IG 1: Truncate the calculated shift amount for the mask

; IG 2: Shift the value and the mask
; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-NEXT: 65535
; IG 3: Initialize the Y and Z channels to zero
; XXX: An optimal scheduler should merge this into one of the previous IGs.
; EG: MOV T[[RW_GPR]].Y, 0.0
; EG: MOV * T[[RW_GPR]].Z, 0.0

; SI: buffer_store_short
define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(1)* %out
  ret void
}

; <2 x i32> argument truncated to <2 x i8> and stored as one vector.
; FUNC-LABEL: {{^}}store_v2i8:
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; SI: buffer_store_byte
; SI: buffer_store_byte
define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
  ret void
}


; <2 x i32> argument truncated to <2 x i16> and stored as one vector.
; FUNC-LABEL: {{^}}store_v2i16:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_short
; SI: buffer_store_short
define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
  ret void
}

; <4 x i32> argument truncated to <4 x i8> and stored as one vector.
; FUNC-LABEL: {{^}}store_v4i8:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
  ret void
}

; floating-point store
; FUNC-LABEL: {{^}}store_f32:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1

; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}

; SI: buffer_store_dword

define void @store_f32(float addrspace(1)* %out, float %in) {
  store float %in, float addrspace(1)* %out
  ret void
}

; <4 x i32> argument truncated to <4 x i16> and stored as one vector.
; The SI-NOT line below rejects any byte-sized store after the four short
; stores.
; FUNC-LABEL: {{^}}store_v4i16:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR

; SI: buffer_store_short
; SI: buffer_store_short
; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_byte
define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
  ret void
}

; vec2 floating-point stores
; The vector is assembled from the two scalar arguments with insertelement
; before being stored.
; FUNC-LABEL: {{^}}store_v2f32:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_dwordx2

define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(1)* %out
  ret void
}

; Direct <4 x i32> store; the -NOT lines require that no second store of the
; same kind follows the first on the r600 targets.
; FUNC-LABEL: {{^}}store_v4i32:
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_dwordx4
define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
  ret void
}

; i64 argument truncated to i8 before the store.
; FUNC-LABEL: {{^}}store_i64_i8:
; EG: MEM_RAT MSKOR
; SI: buffer_store_byte
define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(1)* %out
  ret void
}

; i64 argument truncated to i16 before the store.
; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MEM_RAT MSKOR
; SI: buffer_store_short
define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===;
; Local Address Space
;===------------------------------------------------------------------------===;

; Store of the constant i1 true through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_i1:
; EG: LDS_BYTE_WRITE
; SI: ds_write_b8
define void @store_local_i1(i1 addrspace(3)* %out) {
entry:
  store i1 true, i1 addrspace(3)* %out
  ret void
}

; i8 store through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_i8:
; EG: LDS_BYTE_WRITE

; SI: ds_write_b8
define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
  store i8 %in, i8 addrspace(3)* %out
  ret void
}

; i16 store through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_i16:
; EG: LDS_SHORT_WRITE

; SI: ds_write_b16
define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
  store i16 %in, i16 addrspace(3)* %out
  ret void
}

; <2 x i16> store through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_v2i16:
; EG: LDS_WRITE

; CM: LDS_WRITE

; SI: ds_write_b16
; SI: ds_write_b16
define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(3)* %out
  ret void
}

; <4 x i8> store through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_v4i8:
; EG: LDS_WRITE

; CM: LDS_WRITE

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(3)* %out
  ret void
}

; <2 x i32> store through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_v2i32:
; EG: LDS_WRITE
; EG: LDS_WRITE

; CM: LDS_WRITE
; CM: LDS_WRITE

; SI: ds_write_b64
define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(3)* %out
  ret void
}

; <4 x i32> store through an addrspace(3) pointer with no explicit alignment
; on the store instruction.
; FUNC-LABEL: {{^}}store_local_v4i32:
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE

; CM: LDS_WRITE
; CM: LDS_WRITE
; CM: LDS_WRITE
; CM: LDS_WRITE

; SI: ds_write_b64
; SI: ds_write_b64
define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(3)* %out
  ret void
}

; Same <4 x i32> store as store_local_v4i32 but with an explicit "align 4";
; the SI checks expect two ds_write2_b32 here instead of two ds_write_b64.
; FUNC-LABEL: {{^}}store_local_v4i32_align4:
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE

; CM: LDS_WRITE
; CM: LDS_WRITE
; CM: LDS_WRITE
; CM: LDS_WRITE

; SI: ds_write2_b32
; SI: ds_write2_b32
define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

; i64 argument truncated to i8, stored through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_i64_i8:
; EG: LDS_BYTE_WRITE
; SI: ds_write_b8
define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(3)* %out
  ret void
}

; i64 argument truncated to i16, stored through an addrspace(3) pointer.
; FUNC-LABEL: {{^}}store_local_i64_i16:
; EG: LDS_SHORT_WRITE
; SI: ds_write_b16
define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(3)* %out
  ret void
}

; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
; should not try to split the 64-bit store back into 2 32-bit stores.
;
; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
; be two 32-bit stores.

; vecload2 loads two adjacent i32 values from %mem (addrspace(2)) and stores
; them to two adjacent i32 slots of %out (addrspace(1)), all with align 4.
; NOTE(review): the description preceding this function expects two 32-bit
; stores on Evergreen, but only one Evergreen store line is matched below, so
; a single emitted store would also pass -- confirm whether a second check
; line was intended.
; FUNC-LABEL: {{^}}vecload2:
; EG: MEM_RAT_CACHELESS STORE_RAW

; CM: MEM_RAT_CACHELESS STORE_DWORD

; SI: buffer_store_dwordx2
define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
entry:
  %0 = load i32, i32 addrspace(2)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
  %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(1)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
  ret void
}

; Attribute group #0 is referenced only by @vecload2 in this file.
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

; When i128 was a legal type this program generated cannot select errors:
; four consecutive i32 constant stores (1, 1, 2, 2) to %out[0..3], each with
; align 4. The label pattern is quoted because the function name contains '-'
; (presumably the assembly printer quotes such names; verify against output).

; FUNC-LABEL: {{^}}"i128-const-store":
; FIXME: We should be able to do this with one store instruction
; EG: STORE_RAW
; EG: STORE_RAW
; EG: STORE_RAW
; EG: STORE_RAW
; CM: STORE_DWORD
; CM: STORE_DWORD
; CM: STORE_DWORD
; CM: STORE_DWORD
; SI: buffer_store_dwordx4
define void @i128-const-store(i32 addrspace(1)* %out) {
entry:
  store i32 1, i32 addrspace(1)* %out, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 1, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 2, i32 addrspace(1)* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 2, i32 addrspace(1)* %arrayidx6, align 4
  ret void
}