; RUN: opt -mtriple=amdgcn-- -S -separate-const-offset-from-gep -reassociate-geps-verify-no-dead-code -gvn < %s | FileCheck -check-prefix=IR %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

@array = internal addrspace(2) constant [4096 x [32 x float]] zeroinitializer, align 4

; Adds up array[x][y], array[x][y + 1], array[x + 1][y] and array[x + 1][y + 1]
; and stores the total to %output. The expectations below want a single GEP
; that carries the variable indices, reused as the base of three float GEPs
; holding the constant element offsets 1, 32 (one row of 32 floats) and 33.

; IR-LABEL: @sum_of_array(
; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  ; Sign-extended copies of the raw indices; these feed the base address.
  %yExt = sext i32 %y to i64
  %xExt = sext i32 %x to i64

  ; array[x][y]
  %basePtr = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %xExt, i64 %yExt
  %val00 = load float, float addrspace(2)* %basePtr, align 4
  %sum0 = fadd float %val00, 0.000000e+00

  ; array[x][y + 1]
  %yNext = add i32 %y, 1
  %yNextExt = sext i32 %yNext to i64
  %ptr01 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %xExt, i64 %yNextExt
  %val01 = load float, float addrspace(2)* %ptr01, align 4
  %sum1 = fadd float %sum0, %val01

  ; array[x + 1][y]
  %xNext = add i32 %x, 1
  %xNextExt = sext i32 %xNext to i64
  %ptr10 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %xNextExt, i64 %yExt
  %val10 = load float, float addrspace(2)* %ptr10, align 4
  %sum2 = fadd float %sum1, %val10

  ; array[x + 1][y + 1]
  %ptr11 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %xNextExt, i64 %yNextExt
  %val11 = load float, float addrspace(2)* %ptr11, align 4
  %sum3 = fadd float %sum2, %val11

  store float %sum3, float addrspace(1)* %output, align 4
  ret void
}

@array2 = internal addrspace(2) constant [4096 x [4 x float]] zeroinitializer, align 4

; Some of the indices go over the maximum mubuf offset, so don't split them.
; The y + 255 access is still expected to become a float GEP off the shared
; base, while both accesses that use x + 256 are expected to keep the add and
; their full two-index array GEPs.

; IR-LABEL: @sum_of_array_over_max_mubuf_offset(
; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 255
; IR: add i32 %x, 256
; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  ; Sign-extended copies of the raw indices; these feed the base address.
  %yExt = sext i32 %y to i64
  %xExt = sext i32 %x to i64

  ; array2[x][y]
  %basePtr = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %xExt, i64 %yExt
  %val00 = load float, float addrspace(2)* %basePtr, align 4
  %sum0 = fadd float %val00, 0.000000e+00

  ; array2[x][y + 255]
  %yFar = add i32 %y, 255
  %yFarExt = sext i32 %yFar to i64
  %ptr01 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %xExt, i64 %yFarExt
  %val01 = load float, float addrspace(2)* %ptr01, align 4
  %sum1 = fadd float %sum0, %val01

  ; array2[x + 256][y]
  %xFar = add i32 %x, 256
  %xFarExt = sext i32 %xFar to i64
  %ptr10 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %xFarExt, i64 %yExt
  %val10 = load float, float addrspace(2)* %ptr10, align 4
  %sum2 = fadd float %sum1, %val10

  ; array2[x + 256][y + 255]
  %ptr11 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %xFarExt, i64 %yFarExt
  %val11 = load float, float addrspace(2)* %ptr11, align 4
  %sum3 = fadd float %sum2, %val11

  store float %sum3, float addrspace(1)* %output, align 4
  ret void
}


@lds_array = internal addrspace(3) global [4096 x [4 x float]] undef, align 4

; DS instructions have a larger immediate offset, so make sure these are OK.
; Same access pattern on the addrspace(3) array, indexed directly with i32.
; Every constant part is expected to be split off as a float element offset:
; 255, 16128 (4032 rows of 4 floats) and 16383 (16128 + 255).
; IR-LABEL: @sum_of_lds_array_over_max_mubuf_offset(
; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %{{[a-zA-Z0-9]+}}, i32 %{{[a-zA-Z0-9]+}}
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383
define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  ; lds_array[x][y]
  %basePtr = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
  %val00 = load float, float addrspace(3)* %basePtr, align 4
  %sum0 = fadd float %val00, 0.000000e+00

  ; lds_array[x][y + 255]
  %yFar = add i32 %y, 255
  %ptr01 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %yFar
  %val01 = load float, float addrspace(3)* %ptr01, align 4
  %sum1 = fadd float %sum0, %val01

  ; lds_array[x + 4032][y]
  %xFar = add i32 %x, 4032
  %ptr10 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %xFar, i32 %y
  %val10 = load float, float addrspace(3)* %ptr10, align 4
  %sum2 = fadd float %sum1, %val10

  ; lds_array[x + 4032][y + 255]
  %ptr11 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %xFar, i32 %yFar
  %val11 = load float, float addrspace(3)* %ptr11, align 4
  %sum3 = fadd float %sum2, %val11

  store float %sum3, float addrspace(1)* %output, align 4
  ret void
}