; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b16
; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  store i8 456, i8 addrspace(3)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }