; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; Codegen test for pairing LDS (addrspace(3)) accesses that sit at large
; constant strides from one kernel-argument pointer.  Each kernel below issues
; a series of scalar loads or stores; the directives in front of it list the
; paired ds_read2 / ds_write2 (or st64) instructions that are expected, plus
; any v_add_u32 instructions that build additional base registers.
; The test is run for tonga (VI directives) and gfx900 (GFX9 directives).
; The VI directives accept any SGPR as the addend of a large base adjustment,
; whereas the GFX9 directives expect the literal byte offset.

; Eight float loads at element indices 0, 100, ..., 700 (400-byte stride),
; issued in ascending address order.
; Expected: four ds_read2_b32 with offset1:100, one on the incoming base and
; three on bases advanced by 0x320, 0x640 and 0x960 bytes (200, 400 and 600
; floats).
; GCN-LABEL: ds_read32_combine_stride_400:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = load float, float addrspace(3)* %arg, align 4
  %tmp2 = fadd float %tmp, 0.000000e+00
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
  %tmp5 = fadd float %tmp2, %tmp4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
  %tmp8 = fadd float %tmp5, %tmp7
  %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  %tmp10 = load float, float addrspace(3)* %tmp9, align 4
  %tmp11 = fadd float %tmp8, %tmp10
  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
  %tmp14 = fadd float %tmp11, %tmp13
  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
  %tmp17 = fadd float %tmp14, %tmp16
  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
  %tmp20 = fadd float %tmp17, %tmp19
  %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  %tmp22 = load float, float addrspace(3)* %tmp21, align 4
  %tmp23 = fadd float %tmp20, %tmp22
  store float %tmp23, float *%arg1, align 4
  ret void
}

; Same eight addresses as ds_read32_combine_stride_400, but the loads are
; issued from the highest index (700) down to the base pointer.
; The expected instructions are identical to the ascending case.
; GCN-LABEL: ds_read32_combine_stride_400_back:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  %tmp2 = load float, float addrspace(3)* %tmp, align 4
  %tmp3 = fadd float %tmp2, 0.000000e+00
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
  %tmp6 = fadd float %tmp3, %tmp5
  %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  %tmp8 = load float, float addrspace(3)* %tmp7, align 4
  %tmp9 = fadd float %tmp6, %tmp8
  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
  %tmp12 = fadd float %tmp9, %tmp11
  %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  %tmp14 = load float, float addrspace(3)* %tmp13, align 4
  %tmp15 = fadd float %tmp12, %tmp14
  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
  %tmp18 = fadd float %tmp15, %tmp17
  %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  %tmp20 = load float, float addrspace(3)* %tmp19, align 4
  %tmp21 = fadd float %tmp18, %tmp20
  %tmp22 = load float, float addrspace(3)* %arg, align 4
  %tmp23 = fadd float %tmp21, %tmp22
  store float %tmp23, float *%arg1, align 4
  ret void
}

; Eight float loads at element indices 0, 2048, ..., 14336 (8192-byte stride).
; Expected: four ds_read2st64_b32 that all use the unmodified base, with
; offset pairs (0,32), (64,96), (128,160) and (192,224); 2048 floats equal
; 32 units of 64 elements.  No base adjustment is checked for this case.
; GCN-LABEL: ds_read32_combine_stride_8192:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224
define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = load float, float addrspace(3)* %arg, align 4
  %tmp2 = fadd float %tmp, 0.000000e+00
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
  %tmp5 = fadd float %tmp2, %tmp4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
  %tmp8 = fadd float %tmp5, %tmp7
  %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
  %tmp10 = load float, float addrspace(3)* %tmp9, align 4
  %tmp11 = fadd float %tmp8, %tmp10
  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
  %tmp14 = fadd float %tmp11, %tmp13
  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
  %tmp17 = fadd float %tmp14, %tmp16
  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
  %tmp20 = fadd float %tmp17, %tmp19
  %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
  %tmp22 = load float, float addrspace(3)* %tmp21, align 4
  %tmp23 = fadd float %tmp20, %tmp22
  store float %tmp23, float *%arg1, align 4
  ret void
}

; Six float loads at element indices 2, 2050, ..., 10242: the 8192-byte stride
; again, but starting 8 bytes past the argument pointer.
; Expected: three ds_read2st64_b32 with offset1:32, on bases advanced by 8,
; 0x4008 and 0x8008 bytes (element indices 2, 4098 and 8194).
; GCN-LABEL: ds_read32_combine_stride_8192_shifted:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]

; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32
define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
  %tmp2 = load float, float addrspace(3)* %tmp, align 4
  %tmp3 = fadd float %tmp2, 0.000000e+00
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050
  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
  %tmp6 = fadd float %tmp3, %tmp5
  %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098
  %tmp8 = load float, float addrspace(3)* %tmp7, align 4
  %tmp9 = fadd float %tmp6, %tmp8
  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146
  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
  %tmp12 = fadd float %tmp9, %tmp11
  %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194
  %tmp14 = load float, float addrspace(3)* %tmp13, align 4
  %tmp15 = fadd float %tmp12, %tmp14
  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242
  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
  %tmp18 = fadd float %tmp15, %tmp17
  store float %tmp18, float *%arg1, align 4
  ret void
}

; Eight double loads at element indices 0, 50, ..., 350 (400-byte stride).
; Expected: three ds_read2_b64 on the incoming base with offset pairs (0,50),
; (100,150) and (200,250), and a fourth with offset1:50 on a base advanced by
; 0x960 bytes (element index 300).
; GCN-LABEL: ds_read64_combine_stride_400:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50
define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
bb:
  %tmp = load double, double addrspace(3)* %arg, align 8
  %tmp2 = fadd double %tmp, 0.000000e+00
  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
  %tmp4 = load double, double addrspace(3)* %tmp3, align 8
  %tmp5 = fadd double %tmp2, %tmp4
  %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
  %tmp7 = load double, double addrspace(3)* %tmp6, align 8
  %tmp8 = fadd double %tmp5, %tmp7
  %tmp9 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
  %tmp10 = load double, double addrspace(3)* %tmp9, align 8
  %tmp11 = fadd double %tmp8, %tmp10
  %tmp12 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
  %tmp13 = load double, double addrspace(3)* %tmp12, align 8
  %tmp14 = fadd double %tmp11, %tmp13
  %tmp15 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
  %tmp16 = load double, double addrspace(3)* %tmp15, align 8
  %tmp17 = fadd double %tmp14, %tmp16
  %tmp18 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
  %tmp19 = load double, double addrspace(3)* %tmp18, align 8
  %tmp20 = fadd double %tmp17, %tmp19
  %tmp21 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
  %tmp22 = load double, double addrspace(3)* %tmp21, align 8
  %tmp23 = fadd double %tmp20, %tmp22
  store double %tmp23, double *%arg1, align 8
  ret void
}

; Six double loads at element indices 1, 1025, ..., 5121 (8192-byte stride,
; starting 8 bytes past the argument pointer).
; Expected: three ds_read2st64_b64 with offset1:16 (1024 doubles equal 16 units
; of 64 elements), on bases advanced by 8, 0x4008 and 0x8008 bytes (element
; indices 1, 2049 and 4097).
; GCN-LABEL: ds_read64_combine_stride_8192_shifted:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]

; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16
define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
bb:
  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
  %tmp2 = load double, double addrspace(3)* %tmp, align 8
  %tmp3 = fadd double %tmp2, 0.000000e+00
  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
  %tmp5 = load double, double addrspace(3)* %tmp4, align 8
  %tmp6 = fadd double %tmp3, %tmp5
  %tmp7 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
  %tmp8 = load double, double addrspace(3)* %tmp7, align 8
  %tmp9 = fadd double %tmp6, %tmp8
  %tmp10 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
  %tmp11 = load double, double addrspace(3)* %tmp10, align 8
  %tmp12 = fadd double %tmp9, %tmp11
  %tmp13 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
  %tmp14 = load double, double addrspace(3)* %tmp13, align 8
  %tmp15 = fadd double %tmp12, %tmp14
  %tmp16 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
  %tmp17 = load double, double addrspace(3)* %tmp16, align 8
  %tmp18 = fadd double %tmp15, %tmp17
  store double %tmp18, double *%arg1, align 8
  ret void
}

; Eight float stores of 1.0 at element indices 0, 100, ..., 700 (400-byte
; stride), issued in ascending address order.
; Expected: four ds_write2_b32 with offset1:100, one on the incoming base and
; three on bases advanced by 0x320, 0x640 and 0x960 bytes.
; GCN-LABEL: ds_write32_combine_stride_400:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
bb:
  store float 1.000000e+00, float addrspace(3)* %arg, align 4
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
  ret void
}

; Same eight addresses as ds_write32_combine_stride_400, but the stores are
; issued from the highest index (700) down to the base pointer.
; The expected instructions are identical to the ascending case.
; GCN-LABEL: ds_write32_combine_stride_400_back:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
  store float 1.000000e+00, float addrspace(3)* %arg, align 4
  ret void
}

; Eight float stores of 1.0 at element indices 0, 2048, ..., 14336 (8192-byte
; stride).
; Expected: four ds_write2st64_b32 that all use the unmodified base, with
; offset pairs (0,32), (64,96), (128,160) and (192,224).  No base adjustment
; is checked for this case.
; GCN-LABEL: ds_write32_combine_stride_8192:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224
define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) {
bb:
  store float 1.000000e+00, float addrspace(3)* %arg, align 4
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
  ret void
}

; Six float stores of 1.0 at element indices 1, 2049, ..., 10241: the 8192-byte
; stride again, but starting 4 bytes past the argument pointer.
; Expected: three ds_write2st64_b32 with offset1:32, on bases advanced by 4,
; 0x4004 and 0x8004 bytes (element indices 1, 4097 and 8193).
; GCN-LABEL: ds_write32_combine_stride_8192_shifted:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]]

; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
bb:
  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049
  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097
  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145
  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193
  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241
  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
  ret void
}

; Eight double stores of 1.0 at element indices 0, 50, ..., 350 (400-byte
; stride).
; Expected: three ds_write2_b64 on the incoming base with offset pairs (0,50),
; (100,150) and (200,250), and a fourth with offset1:50 on a base advanced by
; 0x960 bytes (element index 300).
; GCN-LABEL: ds_write64_combine_stride_400:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
bb:
  store double 1.000000e+00, double addrspace(3)* %arg, align 8
  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
  store double 1.000000e+00, double addrspace(3)* %tmp, align 8
  %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
  %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
  %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
  %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
  store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
  ret void
}

; Six double stores of 1.0 at element indices 1, 1025, ..., 5121 (8192-byte
; stride, starting 8 bytes past the argument pointer).
; Expected: three ds_write2st64_b64 with offset1:16, on bases advanced by 8,
; 0x4008 and 0x8008 bytes (element indices 1, 2049 and 4097).
; GCN-LABEL: ds_write64_combine_stride_8192_shifted:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]

; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
bb:
  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
  store double 1.000000e+00, double addrspace(3)* %tmp, align 8
  %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
  %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
  %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
  ret void
}