/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/conv_constants.h"

#include <algorithm>
#include <string>
#include <utility>

#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {
// Adreno can provide up to ~3-4KB of constant memory, but in some cases even
// 3KB can have very bad performance.
int GetAdrenoOptimalMaxConstantSize(const AdrenoInfo& adreno_info) {
  if (adreno_info.IsAdreno3xx() || adreno_info.IsAdreno4xx() ||
      adreno_info.IsAdreno5xx()) {
    return 256 * 10;  // 2.5KB
  } else {
    return 256 * 14;  // 3.5KB
  }
}

int GetOptimalMaxConstantSize(const GpuInfo& info) {
  if (!info.IsAdreno()) {
    // In general we do not expect this kernel to be used on non-Adreno GPUs,
    // since it is tuned for __constant memory, which gives a big benefit on
    // Adreno.
    return 1024;  // 1KB
  } else {
    return GetAdrenoOptimalMaxConstantSize(info.adreno_info);
  }
}

// src_size and dst_size must be <= 4.
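// For example, with src_size = 4, dst_size = 4, const_mem_offset = 0,
// dst = "r0" and src = "src", the dot-product variant emits:
//   r0.x += dot(src, constants[0]);
//   r0.y += dot(src, constants[1]);
//   r0.z += dot(src, constants[2]);
//   r0.w += dot(src, constants[3]);
// while the per-channel variant (non-F32_F16 path) emits:
//   r0 += src.x * constants[0];
//   r0 += src.y * constants[1];
//   r0 += src.z * constants[2];
//   r0 += src.w * constants[3];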
std::string GenerateConv(int src_size, int dst_size, bool use_dot_conv,
                         int const_mem_offset, CalculationsPrecision precision,
                         const std::string& dst, const std::string& src) {
  std::string result;
  const std::string postfixes[] = {".x", ".y", ".z", ".w"};
  if (use_dot_conv) {
    const std::string src_postfixes[] = {".x", ".xy", ".xyz", ""};
    const std::string src_postfix = src_postfixes[src_size - 1];
    for (int i = 0; i < dst_size; ++i) {
      result += " " + dst + postfixes[i] + " += dot(" + src +
                ", constants[" + std::to_string(const_mem_offset + i) + "]" +
                src_postfix + ");\n";
    }
  } else {
    const std::string dst_postfixes[] = {".x", ".xy", ".xyz", ""};
    const std::string dst_postfix = dst_postfixes[dst_size - 1];
    if (precision == CalculationsPrecision::F32_F16) {
      for (int i = 0; i < src_size; ++i) {
        if (i != 0) {
          result += " + ";
        }
        std::string src_name = src;
        if (src_size != 1) {
          src_name += postfixes[i];
        }
        result += src_name + " * constants[" +
                  std::to_string(const_mem_offset + i) + "]" + dst_postfix;
      }
      std::string size = dst_size == 1 ? "" : std::to_string(dst_size);
      result = " " + dst + dst_postfix + " += TO_ACCUM_FLT" + size + "(" +
               result + ");\n";
    } else {
      for (int i = 0; i < src_size; ++i) {
        std::string src_name = src;
        if (src_size != 1) {
          src_name += postfixes[i];
        }
        result += " " + dst + dst_postfix + " += " + src_name +
                  " * constants[" + std::to_string(const_mem_offset + i) +
                  "]" + dst_postfix + ";\n";
      }
    }
  }
  return result;
}

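// Generates a fully unrolled convolution kernel: the filter weights live in
// __constant memory (args.weights), one ACCUM_FLT4 accumulator r0..r{out_z-1}
// is kept per output slice, and the loops over input slices and the filter
// window are unrolled at code-generation time. After accumulation, biases are
// added and the results are written to dst_tensor.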
std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
                                            const OHWI& weights_shape,
                                            bool stride_correction,
                                            bool use_dot_conv,
                                            GPUOperation* op) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddSrcTensor("src_tensor", src_desc);

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddDstTensor("dst_tensor", dst_desc);

  const int out_z = DivideRoundUp(weights_shape.o, 4);
  const std::string kOutZ = std::to_string(out_z);
  const int src_depth = DivideRoundUp(weights_shape.i, 4);

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};

  std::string c;
  c += "MAIN_FUNCTION($0) {\n";
  c += " int X = GLOBAL_ID_0;\n";
  c += " int Y = GLOBAL_ID_1;\n";
  c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
       "return;\n";
  if (stride_correction) {
    c += " int start_x = " +
         GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
                               "args.padding_x") +
         ";\n";
  } else {
    if (op_def.IsBatchSupported()) {
      c += " int start_x = X * args.stride_x + args.padding_x * "
           "args.src_tensor.Batch();\n";
    } else {
      c += " int start_x = X * args.stride_x + args.padding_x;\n";
    }
  }
  c += " int start_y = Y * args.stride_y + args.padding_y;\n";
  c += " __constant FLT4* constants = args.weights.GetPtr();\n";
  for (int i = 0; i < out_z; ++i) {
    c += " ACCUM_FLT4 r" + std::to_string(i) + " = INIT_ACCUM_FLT4(0.0f);\n";
  }
  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_out", "y_out", "z_out"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " || ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  int filters_counter = 0;
  for (int s = 0; s < src_depth; ++s) {
    const int src_ch_count = std::min(4, weights_shape.i - s * 4);
    const std::string s_count =
        src_ch_count == 1 ? "" : std::to_string(src_ch_count);
    const std::string s_type = absl::StrCat("FLT", s_count);
    const std::string s_postfix = postfixes[src_ch_count - 1];
    const std::string dilation_x =
        op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
                                  : "args.dilation_x";
    for (int ky = 0; ky < weights_shape.h; ++ky) {
      std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
        c += " {\n";
        c += " bool y_out = " + s_y + " < 0 || " + s_y +
             " >= args.src_tensor.Height();\n";
      }
      for (int kx = 0; kx < weights_shape.w; ++kx) {
        c += " {\n";
        std::string s_x =
            absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")");
        if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
          c += " bool x_out = " + s_x + " < 0 || " + s_x +
               ">= args.src_tensor.Width();\n";
        }
        if (check.empty()) {
          c += " " + s_type + " src = args.src_tensor.Read(" + s_x + ", " +
               s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n";
        } else {
          c += " FLT4 zero_vec = INIT_FLT4(0.0);\n";
          c += " " + s_type + " src = x_out || y_out ? ";
          c += "zero_vec" + s_postfix + " : args.src_tensor.Read(" + s_x +
               ", " + s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n";
        }
        for (int d = 0; d < out_z; ++d) {
          const int dst_ch_count = std::min(4, weights_shape.o - d * 4);
          c += GenerateConv(src_ch_count, dst_ch_count, use_dot_conv,
                            filters_counter, op_def.precision,
                            "r" + std::to_string(d), "src");
          filters_counter += use_dot_conv ? dst_ch_count : src_ch_count;
        }
        c += " }\n";
      }
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
        c += " }\n";
      }
    }
  }
  for (int i = 0; i < out_z; ++i) {
    std::string s_i = std::to_string(i);
    c += " {\n";
    c += " FLT4 res = TO_FLT4(r" + s_i + ") + args.biases.Read(" + s_i +
         ");\n";
    c += " args.dst_tensor.Write(res, X, Y, " + s_i + ");\n";
    c += " }\n";
  }
  c += "}\n";
  return c;
}

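// Chooses between the two weight layouts by comparing how many padded weight
// values each needs per spatial position: the dot-conv layout stores
// dst_channels * src_depth * 4 values, the regular layout stores
// src_channels * dst_depth * 4 (see IsConvConstantsSupported below).
// For example, with src_channels = 6 and dst_channels = 2: src_depth = 2,
// dst_depth = 1, so 2 * 2 = 4 < 6 * 1 = 6 and the dot-conv layout is
// preferred.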
bool IsDotConvBetter(int src_channels, int dst_channels) {
  if (dst_channels % 4 == 0) {
    return false;
  }

  // dst_channels % 4 != 0
  if (src_channels % 4 == 0) {
    return true;
  }

  // dst_channels % 4 != 0 && src_channels % 4 != 0
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_depth = DivideRoundUp(dst_channels, 4);
  return dst_channels * src_depth < src_channels * dst_depth;
}

}  // namespace

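// Checks that the packed weights fit into the optimal __constant budget and
// that the number of output slices (accumulator registers) stays small.
// For example, a 3x3 convolution with 8 input and 8 output channels in F16
// uses the regular layout (o % 4 == 0): aligned_ch_count = 8 * 2 * 4 = 64,
// filters_count = 64 * 3 * 3 = 576, filters_buffer_size = 576 * 2 = 1152
// bytes, which fits the ~2.5-3.5KB Adreno budget with 2 FLT4 accumulators.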
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr) {
  if (gpu_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
      definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
    // BUG, some AMD GPUs crash without it
    return false;
  }

  if (gpu_info.IsApiOpenCl() && gpu_info.IsAdreno()) {
    const std::string kBadDriver =
        "OpenCL 2.0 QUALCOMM build: commit #7ff4f54 changeid #I4460aa6217 "
        "Date: 12/30/18";
    if (absl::StrContains(gpu_info.opencl_info.platform_version, kBadDriver)) {
      return false;
    }
  }

  const bool use_dot_conv =
      IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
  const auto& w_shape = attr.weights.shape;
  const int src_depth = DivideRoundUp(w_shape.i, 4);
  const int dst_depth = DivideRoundUp(w_shape.o, 4);
  const int aligned_ch_count =
      use_dot_conv ? w_shape.o * src_depth * 4 : w_shape.i * dst_depth * 4;
  const int filters_count = aligned_ch_count * w_shape.h * w_shape.w;
  const int float_size = definition.precision == CalculationsPrecision::F32
                             ? sizeof(float)
                             : sizeof(half);
  const int filters_buffer_size = filters_count * float_size;
  const int kConstantMaxSize = GetOptimalMaxConstantSize(gpu_info);
  const int flt4_registers = DivideRoundUp(w_shape.o, 4);
  return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}

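// Typical usage (sketch; the actual operation selection lives elsewhere in
// the delegate):
//   if (IsConvConstantsSupported(gpu_info, definition, attr)) {
//     GPUOperation op = CreateConvConstants(gpu_info, definition, attr);
//   }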
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
                                 const OperationDef& definition,
                                 const Convolution2DAttributes& attr) {
  const bool use_dot_conv =
      IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
  GPUOperation op(definition);
  UploadWeightsForConvConstants(attr.weights, definition.precision,
                                use_dot_conv, &op);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;

  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;

  op.code_ = GenerateConvolutionConstantCode(
      definition, attr.weights.shape, stride_correction, use_dot_conv, &op);
  if (definition.precision == CalculationsPrecision::F16 &&
      gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
    op.compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
  }
  if (definition.precision != CalculationsPrecision::F32 &&
      gpu_info.IsPowerVR()) {
    // BUG, some PowerVRs (GE8320) produce incorrect results without it
    op.compiler_options_.push_back(CompilerOptions::kClDisableOptimizations);
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::BUFFER;
  desc.element_type = definition.GetDataType();
  desc.memory_type = MemoryType::CONSTANT;
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

}  // namespace gpu
}  // namespace tflite