; Regression test for promotion of small addrspace(5) allocas to vectors.
; The llc invocations cover verde and tonga with promote-alloca disabled and
; enabled, plus an r600 redwood run; the opt invocation checks the IR produced
; by -amdgpu-promote-alloca followed by -sroa and -instcombine.

; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s

; "A5" makes address space 5 the alloca address space, matching the
; addrspace(5) allocas used by every function in this file.
target datalayout = "A5"

; vector_read fills a [4 x i32] addrspace(5) alloca with the constants 0..3,
; loads the element selected by the runtime value %index and stores it to %out.
; After the opt pipeline the alloca is expected to be gone, leaving an
; extractelement from a constant vector. The r600 run expects four MOV
; instructions followed by one MOVA_INT.

; OPT-LABEL: @vector_read(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to the four array slots.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Slot i is initialised to the value i.
  store i32 0, i32 addrspace(5)* %x
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  ; Dynamically indexed read; this is the access expected to become an
  ; extractelement.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

; vector_write zero-fills a [4 x i32] addrspace(5) alloca, writes 1 to the slot
; selected by %w_index, then reads the slot selected by %r_index and stores it
; to %out. After the opt pipeline this is expected to be an insertelement into
; a zero vector followed by an extractelement. The r600 run expects four MOV
; instructions and two MOVA_INT, one per dynamic index.

; OPT-LABEL: @vector_write(
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
; OPT: store i32 %1, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_write:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to the four array slots.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Zero every slot before the dynamic write.
  store i32 0, i32 addrspace(5)* %x
  store i32 0, i32 addrspace(5)* %y
  store i32 0, i32 addrspace(5)* %z
  store i32 0, i32 addrspace(5)* %w
  ; Dynamically indexed write of the constant 1.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
  store i32 1, i32 addrspace(5)* %tmp1
  ; Dynamically indexed read, possibly of the slot just written.
  %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
  %tmp3 = load i32, i32 addrspace(5)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

; bitcast_gep zero-fills a [4 x i32] addrspace(5) alloca, then reads it through
; a pointer built by taking the address of element 1, bitcasting that back to
; an array pointer and indexing element 0 of the result. Every slot holds 0,
; so the loaded value is the constant 0. %w_index and %r_index are unused.
;
; This test should be optimized to:
; store i32 0, i32 addrspace(1)* %out

; OPT-LABEL: @bitcast_gep(
; OPT: store i32 0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to the four array slots.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  store i32 0, i32 addrspace(5)* %x
  store i32 0, i32 addrspace(5)* %y
  store i32 0, i32 addrspace(5)* %z
  store i32 0, i32 addrspace(5)* %w
  ; Address of element 1, reinterpreted as the start of a [4 x i32] array.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
  ; Element 0 of the reinterpreted array is element 1 of the original alloca.
  %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
  %tmp4 = load i32, i32 addrspace(5)* %tmp3
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

; vector_read_bitcast_gep is like a plain dynamic read of a [4 x i32]
; addrspace(5) alloca, except that slot 0 is initialised by storing float 1.0
; through a bitcast of its i32 pointer. The opt pipeline is still expected to
; promote the alloca; slot 0 then appears as 1065353216, the i32 bit pattern
; of float 1.0.

; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to the four array slots.
  %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Slot 0 is written as a float through a bitcast pointer.
  %bc = bitcast i32 addrspace(5)* %x to float addrspace(5)*
  store float 1.0, float addrspace(5)* %bc
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  ; Dynamically indexed i32 read of the array.
  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

; vector_read_bitcast_alloca allocates a [4 x i32] addrspace(5) array but only
; ever accesses it through a bitcast to [4 x float]: four constant float
; stores and one dynamically indexed float load. The checks below record the
; current, non-promoted result: an alloca of [4 x float] with the four stores
; and the load still present.
;
; FIXME: Should be able to promote this. Instcombine should fold the
; cast in the hasOneUse case so it might not matter in practice

; OPT-LABEL: @vector_read_bitcast_alloca(
; OPT: alloca [4 x float]
; OPT: store float
; OPT: store float
; OPT: store float
; OPT: store float
; OPT: load float
define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; All later accesses go through this float-array view of the alloca.
  %tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)*
  %x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0
  %y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1
  %z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2
  %w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3
  store float 0.0, float addrspace(5)* %x
  store float 1.0, float addrspace(5)* %y
  store float 2.0, float addrspace(5)* %z
  store float 4.0, float addrspace(5)* %w
  ; Dynamically indexed float read through the bitcast view.
  %tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index
  %tmp2 = load float, float addrspace(5)* %tmp1
  store float %tmp2, float addrspace(1)* %out
  ret void
}

; The pointer arguments in local address space should not affect promotion to vector.
;
; vector_read_with_local_arg has the same body as a plain dynamic read of a
; [4 x i32] addrspace(5) alloca holding 0..3, but the kernel also takes an
; unused addrspace(3) pointer argument, %stopper. The expected opt output is
; the same extractelement from a constant vector.

; OPT-LABEL: @vector_read_with_local_arg(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to the four array slots.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Slot i is initialised to the value i.
  store i32 0, i32 addrspace(5)* %x
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  ; Dynamically indexed read; %stopper is never referenced.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}
