; Test instruction selection for unaligned and under-aligned loads and
; stores on AMDGPU, covering the local (LDS), global, and constant
; address spaces, with and without the unaligned-buffer-access feature.
; ALIGNED checks apply when unaligned buffer accesses are unsupported
; (byte/short splitting); UNALIGNED checks apply when the hardware
; feature is enabled (full-width dword accesses).
; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC -check-prefix=ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC -check-prefix=UNALIGNED %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC -check-prefix=ALIGNED %s

; SI-LABEL: {{^}}local_unaligned_load_store_i16:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
  %v = load i16, i16 addrspace(3)* %p, align 1
  store i16 %v, i16 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_i16:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_ushort
; UNALIGNED: buffer_store_short
; SI: s_endpgm
define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
  %v = load i16, i16 addrspace(1)* %p, align 1
  store i16 %v, i16 addrspace(1)* %r, align 1
  ret void
}

; FUNC-LABEL: {{^}}local_unaligned_load_store_i32:

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI-NOT: v_or
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 1
  store i32 %v, i32 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 2
  ret void
}

; FUNC-LABEL: {{^}}local_align2_load_store_i32:
; GCN: ds_read_u16
; GCN: ds_read_u16
; GCN: ds_write_b16
; GCN: ds_write_b16
define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 2
  store i32 %v, i32 addrspace(3)* %r, align 2
  ret void
}

; FUNC-LABEL: {{^}}local_unaligned_load_store_i64:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
  %v = load i64, i64 addrspace(3)* %p, align 1
  store i64 %v, i64 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 1
  store i64 %v, i64 addrspace(1)* %r, align 1
  ret void
}

; FUNC-LABEL: {{^}}local_unaligned_load_store_v4i32:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx4
; UNALIGNED: buffer_store_dwordx4
define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
  ret void
}

; FUNC-LABEL: {{^}}local_load_i64_align_4:
; GCN: ds_read2_b32
define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset:
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
  %val = load i64, i64 addrspace(3)* %ptr, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
; GCN: s_endpgm
define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  %val = load i64, i64 addrspace(3)* %ptri64, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}local_load_i64_align_1:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: store_dwordx2
define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}local_store_i64_align_4:
; GCN: ds_write2_b32
define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
  store i64 %val, i64 addrspace(3)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset:
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN: s_endpgm
define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
  store i64 0, i64 addrspace(3)* %ptr, align 4
  ret void
}

; FUNC-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GCN: s_endpgm
define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  store i64 0, i64 addrspace(3)* %ptri64, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: s_load_dword

; SI: buffer_store_dword
define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(2)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dword
; UNALIGNED: buffer_store_dword
define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(2)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(2)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_i64:
; SI: s_load_dwordx2
; SI: buffer_store_dwordx2
define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(2)* %p, align 4
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_v4i32:
; SI: s_load_dwordx4
; SI: buffer_store_dwordx4
define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx2

; SI: buffer_store_dwordx2
define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx4

; SI: buffer_store_dwordx4
define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_i8:
; SI: buffer_load_ubyte
; SI: buffer_store_byte
define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(2)* %p, align 4
  store i8 %v, i8 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i8:
; SI: buffer_load_ubyte
; SI: buffer_store_byte
define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(2)* %p, align 2
  store i8 %v, i8 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
  %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
  %v0 = load i32, i32 addrspace(2)* %p, align 4
  %v1 = load i32, i32 addrspace(2)* %gep0, align 4

  %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
  store i32 %v0, i32 addrspace(1)* %r, align 4
  store i32 %v1, i32 addrspace(1)* %gep1, align 4
  ret void
}

attributes #0 = { nounwind }