; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

; Checks how small, fixed-size (32-byte) llvm.memcpy calls are expanded for
; addrspace(3) ("lds") and addrspace(1) ("global") pointers at several
; alignments. Both run lines above share the same check prefixes, so every
; expectation below applies to both -mcpu settings.

declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind


; 32 bytes, align 1, addrspace(3): 32 byte-sized reads, then 32 byte-sized
; writes, in that order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
  ret void
}

; 32 bytes, align 2, addrspace(3): 16 halfword reads, then 16 halfword writes,
; in that order.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16

; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16
; SI: ds_read_u16

; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16

; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16
; SI: ds_write_b16

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
  ret void
}

; 32 bytes, align 4, addrspace(3): eight 32-bit read/write pairs (32 / 4),
; accepted in any interleaving. Exactly eight pairs are listed so the checks
; still hold when DAG matches are not allowed to overlap.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
  ret void
}

; 32 bytes, align 8, addrspace(3): currently checked as eight 32-bit
; read/write pairs, accepted in any interleaving, followed by the end of the
; program.
; FIXME: Use 64-bit ops
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI-DAG: ds_read_b32
; SI-DAG: ds_write_b32

; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
  ret void
}

; 32 bytes, align 1, addrspace(1): 32 byte load/store pairs, accepted in any
; interleaving.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
  ret void
}

; 32 bytes, align 2, addrspace(1): 16 halfword loads and 16 halfword stores,
; accepted in any interleaving.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort

; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short

; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
  ret void
}

; 32 bytes, align 4, addrspace(1): two dwordx4 loads followed by two dwordx4
; stores.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
  ret void
}

; 32 bytes, align 8, addrspace(1): same expectation as the align 4 case.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
  ret void
}

; 32 bytes, align 16, addrspace(1): same expectation as the align 4 case.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
  ret void
}