; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10,ALL %s

; Overview: every kernel below allocates a small private array (%stack),
; writes it at two indices loaded from %in, and copies elements 0 and 1 to
; %out. The kernels differ only in the array type/size and in the attribute
; group they reference ("amdgpu-flat-work-group-size" and/or
; "amdgpu-waves-per-eu"). The checks record, per invocation, whether the
; alloca survives or is replaced by an addrspace(3) global named
; @<kernel>.stack. Three invocations are used: no -mcpu (SI prefix),
; -mcpu=tonga (CI prefix) and -mcpu=gfx1010 (GFX10 prefix); the SICI prefix is
; shared by the first two and the ALL prefix by all three.
; The name of the expected global is derived from the alloca's value name, so
; %stack must not be renamed.

; Work-group size pinned to 63 (attribute group #0). The tonga invocation
; expects a 63-element addrspace(3) array of [5 x i32]; the invocation without
; -mcpu expects no such global.
; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Work-group size 256 with waves-per-eu "1,3" (attribute group #1). All three
; invocations expect a 256-element addrspace(3) array.
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Work-group size 1024 with waves-per-eu "1,1" (attribute group #2). The tonga
; and gfx1010 invocations expect a 1024-element addrspace(3) array; the
; invocation without -mcpu expects no such global.
; NOTE(review) - the kernel name says 1600 while the attribute and the checks
; use 1024; presumably historical, confirm before renaming anything.
; SI-NOT: @promote_alloca_size_1600.stack
; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] undef, align 4
; GFX10: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Only waves-per-eu "1,10" is set (attribute group #3); no work-group size is
; given. The tonga invocation expects the alloca to be gone, the invocation
; without -mcpu expects it to remain.
; ALL-LABEL: @occupancy_0(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Uses attribute group #4, which carries the same waves-per-eu "1,10" value as
; group #3; the expectations match those of @occupancy_0.
; ALL-LABEL: @occupancy_max(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; 42-byte array with waves-per-eu "1,6" and work-group size 64 (attribute
; group #5). The tonga invocation expects no alloca, the invocation without
; -mcpu expects one. No gfx1010-specific directive is attached to this kernel.
; SI-LABEL: @occupancy_6(
; CI-LABEL: @occupancy_6(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [42 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Same attribute group (#5) as @occupancy_6 but the array is one byte larger
; (43). Both the tonga invocation and the one without -mcpu expect the alloca
; to remain; the gfx1010 invocation expects no alloca after this label.
; ALL-LABEL: @occupancy_6_over(
; SICI: alloca [43 x i8]
; GFX10-NOT: alloca

define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [43 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; 32-byte array with waves-per-eu "1,8" and work-group size 64 (attribute
; group #6). The tonga invocation expects no alloca, the invocation without
; -mcpu expects one.
; SI-LABEL: @occupancy_8(
; CI-LABEL: @occupancy_8(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [32 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Same attribute group (#6) as @occupancy_8 but the array is one byte larger
; (33). Both the tonga invocation and the one without -mcpu expect the alloca
; to remain; the gfx1010 invocation expects no alloca after this label.
; ALL-LABEL: @occupancy_8_over(
; SICI: alloca [33 x i8]
; GFX10-NOT: alloca

define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [33 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; 28-byte array with waves-per-eu "1,9" and work-group size 64 (attribute
; group #7). The tonga invocation expects no alloca, the invocation without
; -mcpu expects one.
; SI-LABEL: @occupancy_9(
; CI-LABEL: @occupancy_9(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [28 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Same attribute group (#7) as @occupancy_9 but the array is one byte larger
; (29). Both the tonga invocation and the one without -mcpu expect the alloca
; to remain; the gfx1010 invocation expects no alloca after this label.
; ALL-LABEL: @occupancy_9_over(
; SICI: alloca [29 x i8]
; GFX10-NOT: alloca

define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [29 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Attribute groups referenced by the kernels above. Groups #3 and #4 are
; intentionally kept as separate (currently identical) groups.
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="63,63" }
attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1024,1024" }
attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }