; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s

; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s

; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two adjacent constant i8 stores, base store marked align 2.
; The checks expect two separate byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; -56 is the i8 bit pattern 0xc8, i.e. the low byte of 456. The literal 456
  ; itself does not fit in i8, so the byte is spelled out explicitly.
  store i8 -56, i8 addrspace(1)* %out, align 2
  ret void
}

; Same as above, but without an explicit alignment on either store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; -56 is the i8 bit pattern 0xc8, i.e. the low byte of 456.
  store i8 -56, i8 addrspace(1)* %out
  ret void
}

; Two adjacent constant i16 stores, base store marked align 4.
; The checks expect a single dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; Same as above with both constants zero.
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; Two adjacent constant i16 stores with no explicit alignment.
; The checks expect two separate short stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; Two adjacent constant i32 stores. The checks expect one dwordx2 store whose
; low register holds 456 (0x1c8) and whose high register holds 123 (0x7b).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed types: i32 at the base, float (through a bitcast pointer) at element 1.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed types: float at the base, i32 (through a bitcast pointer) at element 1.
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; Four adjacent constant i32 stores; the store to the base address is issued
; last. The checks expect a single dwordx4 store spanning 1234 (0x4d2) in the
; lowest register through 333 (0x14d) in the highest.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Four adjacent constant float stores issued in ascending address order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Alternating i32 / float constant stores to four adjacent dwords.
; FIXME: Should be able to merge this
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx2
; GCN-AA: buffer_store_dword v
; GCN-AA: buffer_store_dword v

; GCN: s_endpgm
define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Three adjacent constant i32 stores. The checks expect one dwordx2 store plus
; one dword store, and no further dword stores.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Two adjacent constant i64 stores; a single dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; Four adjacent constant i64 stores; two dwordx4 stores are expected.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; Two adjacent i32 loads copied to two adjacent i32 slots in the same order.
; The checks expect the loaded register pair to be stored directly.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Same as above, but both the source and destination start at element 2
; (byte offset 8).
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; The two loaded values are swapped before being stored. The checks expect
; individual dword loads and stores.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; Four adjacent i32 loads copied to four adjacent i32 slots in the same order.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Three adjacent i32 loads and stores. The checks expect a dwordx2 + dword
; split on both the load and the store side.
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; Float variant of the four-element in-order copy.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; Four-element copy where the source starts at element 11 (byte offset 44)
; and the destination starts at element 7 (byte offset 28).
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Each value goes back to its own element index, but the stores are issued in
; descending address order with a barrier between the loads and the stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: s_barrier
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Same barrier as in the inverse test above. Here the element order is
  ; reversed (w goes to element 0, x to element 3), and the checks expect the
  ; accesses to stay as individual dword loads and stores.
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; Four adjacent i8 loads/stores where the first load and first store are
; marked align 4. The checks expect one dword load and one dword store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Same as above without the align 4 annotations. The checks expect four byte
; loads and four byte stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; This works once AA is enabled on the subtarget
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]

; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx4 [[LOAD]]

; GCN: s_endpgm
define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Local (addrspace 3) variant of the two-constant i8 test. The checks expect
; two separate byte writes.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  ; -56 is the i8 bit pattern 0xc8, i.e. the low byte of 456. The literal 456
  ; itself does not fit in i8, so the byte is spelled out explicitly.
  store i8 -56, i8 addrspace(3)* %out, align 2
  ret void
}

; Two adjacent constant i32 stores to local memory; a single ds_write2_b32
; with offset1:1 is expected.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; Four adjacent constant i32 stores to local memory; two ds_write2_b32
; instructions are expected, one for elements 0/1 and one for elements 2/3.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; Five adjacent constant i32 stores; a dwordx4 store for the first four
; elements followed by a dword store for the fifth is expected.
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; Six adjacent constant i32 stores; dwordx4 + dwordx2 is expected.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; Seven adjacent constant i32 stores; dwordx4 + dwordx2 + dword is expected.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; Eight adjacent constant i32 stores; two dwordx4 stores are expected.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; Whole-vector <3 x i32> load (align 4) followed by a whole-vector store.
; The checks expect a dwordx2 + dword split at offsets 0 and 8, no offen
; addressing, and a scratch size of zero.
; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8

; GCN: ScratchSize: 0{{$}}
define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %v3 = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %v3, <3 x i32> addrspace(1)* %out
  ret void
}

; <3 x i64> variant; the expected split is dwordx4 at offset 0 plus dwordx2
; at offset 16.
; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %v3 = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %v3, <3 x i64> addrspace(1)* %out
  ret void
}

; <3 x float> variant; a constant vector is added to the loaded value before
; it is stored.
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %v3 = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %sum = fadd <3 x float> %v3, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %sum, <3 x float> addrspace(1)* %out
  ret void
}

; <3 x double> variant of the load / fadd / store sequence.
; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %v3 = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %sum = fadd <3 x double> %v3, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %sum, <3 x double> addrspace(1)* %out
  ret void
}

; Barrier intrinsic called by the adjacent-loads inverse/shuffle tests.
declare void @llvm.AMDGPU.barrier.local() #1

; #0 is attached to most test kernels in this file; #1 to the barrier.
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }