/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/conv_constants.h"

#include <algorithm>
#include <string>
#include <utility>

#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {
// Adreno can provide up to ~3-4KB of constant memory, but in some cases even
// 3KB can perform very poorly.
int GetAdrenoOptimalMaxConstantSize(const AdrenoInfo& adreno_info) {
  if (adreno_info.IsAdreno3xx() || adreno_info.IsAdreno4xx() ||
      adreno_info.IsAdreno5xx()) {
    return 256 * 10;  // 2.5KB
  } else {
    return 256 * 14;  // 3.5KB
  }
}

int GetOptimalMaxConstantSize(const GpuInfo& info) {
  if (!info.IsAdreno()) {
    // We do not expect this kernel to be used on non-Adreno GPUs, since it is
    // tuned for __constant memory, which gives a large benefit on Adreno.
    return 1024;  // 1KB
  } else {
    return GetAdrenoOptimalMaxConstantSize(info.adreno_info);
  }
}

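// Emits the inner multiply-accumulate snippet for one (input slice, output
// slice) pair. Two weight layouts are supported:
//   * dot-conv: constants[offset + i] holds the weights of one dst channel
//     across up to 4 src channels, so each dst lane gets code like
//     "r0.x += dot(src, constants[5].xyz);";
//   * regular: constants[offset + i] holds the weights of one src channel
//     across up to 4 dst channels, so each src lane gets code like
//     "r0.xyz += src.x * constants[5].xyz;".
// For F32_F16, the products are summed in FLT precision first and converted
// to the accumulator type once via TO_ACCUM_FLT.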
// src_size and dst_size must be <= 4.
std::string GenerateConv(int src_size, int dst_size, bool use_dot_conv,
                         int const_mem_offset, CalculationsPrecision precision,
                         const std::string& dst, const std::string& src) {
  std::string result;
  const std::string postfixes[] = {".x", ".y", ".z", ".w"};
  if (use_dot_conv) {
    const std::string src_postfixes[] = {".x", ".xy", ".xyz", ""};
    const std::string src_postfix = src_postfixes[src_size - 1];
    for (int i = 0; i < dst_size; ++i) {
      result += "    " + dst + postfixes[i] + " += dot(" + src +
                ", constants[" + std::to_string(const_mem_offset + i) + "]" +
                src_postfix + ");\n";
    }
  } else {
    const std::string dst_postfixes[] = {".x", ".xy", ".xyz", ""};
    const std::string dst_postfix = dst_postfixes[dst_size - 1];
    if (precision == CalculationsPrecision::F32_F16) {
      for (int i = 0; i < src_size; ++i) {
        if (i != 0) {
          result += " + ";
        }
        std::string src_name = src;
        if (src_size != 1) {
          src_name += postfixes[i];
        }
        result += src_name + " * constants[" +
                  std::to_string(const_mem_offset + i) + "]" + dst_postfix;
      }
      std::string size = dst_size == 1 ? "" : std::to_string(dst_size);
      result = "    " + dst + dst_postfix + " += TO_ACCUM_FLT" + size + "(" +
               result + ");\n";
    } else {
      for (int i = 0; i < src_size; ++i) {
        std::string src_name = src;
        if (src_size != 1) {
          src_name += postfixes[i];
        }
        result += "    " + dst + dst_postfix + " += " + src_name +
                  " * constants[" + std::to_string(const_mem_offset + i) + "]" +
                  dst_postfix + ";\n";
      }
    }
  }
  return result;
}

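// Generates the full kernel source: one work-item per output (X, Y) location,
// with the loops over filter taps, input slices, and output slices fully
// unrolled at codegen time. All weights are read from __constant memory, and
// every output slice keeps its own ACCUM_FLT4 accumulator (r0, r1, ...).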
std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
                                            const OHWI& weights_shape,
                                            bool stride_correction,
                                            bool use_dot_conv,
                                            GPUOperation* op) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddSrcTensor("src_tensor", src_desc);

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddDstTensor("dst_tensor", dst_desc);

  const int out_z = DivideRoundUp(weights_shape.o, 4);
  const std::string kOutZ = std::to_string(out_z);
  const int src_depth = DivideRoundUp(weights_shape.i, 4);

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};

  std::string c;
  c += "MAIN_FUNCTION($0) {\n";
  c += "  int X = GLOBAL_ID_0;\n";
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
       "return;\n";
  if (stride_correction) {
    c += "  int start_x = " +
         GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
                               "args.padding_x") +
         ";\n";
  } else {
    if (op_def.IsBatchSupported()) {
      c += "  int start_x = X * args.stride_x + args.padding_x * "
           "args.src_tensor.Batch();\n";
    } else {
      c += "  int start_x = X * args.stride_x + args.padding_x;\n";
    }
  }
  c += "  int start_y = Y * args.stride_y + args.padding_y;\n";
  c += "  __constant FLT4* constants = args.weights.GetPtr();\n";
  for (int i = 0; i < out_z; ++i) {
    c += "  ACCUM_FLT4 r" + std::to_string(i) + " = INIT_ACCUM_FLT4(0.0f);\n";
  }
  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_out", "y_out", "z_out"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " || ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  int filters_counter = 0;
  for (int s = 0; s < src_depth; ++s) {
    const int src_ch_count = std::min(4, weights_shape.i - s * 4);
    const std::string s_count =
        src_ch_count == 1 ? "" : std::to_string(src_ch_count);
    const std::string s_type = absl::StrCat("FLT", s_count);
    const std::string s_postfix = postfixes[src_ch_count - 1];
    const std::string dilation_x =
        op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
                                  : "args.dilation_x";
    for (int ky = 0; ky < weights_shape.h; ++ky) {
      std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
        c += "  {\n";
        c += "  bool y_out = " + s_y + " < 0 || " + s_y +
             " >= args.src_tensor.Height();\n";
      }
      for (int kx = 0; kx < weights_shape.w; ++kx) {
        c += "  {\n";
        std::string s_x =
            absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")");
        if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
          c += "    bool x_out = " + s_x + " < 0 || " + s_x +
               " >= args.src_tensor.Width();\n";
        }
        if (check.empty()) {
          c += "    " + s_type + " src = args.src_tensor.Read(" + s_x + ", " +
               s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n";
        } else {
          // Use the precomputed check so that only the bounds flags actually
          // emitted above (x_out/y_out) are referenced in the generated code.
          c += "    FLT4 zero_vec = INIT_FLT4(0.0);\n";
          c += "    " + s_type + " src = " + check + " ? ";
          c += "zero_vec" + s_postfix + " : args.src_tensor.Read(" + s_x +
               ", " + s_y + ", " + std::to_string(s) + ")" + s_postfix + ";\n";
        }
        for (int d = 0; d < out_z; ++d) {
          const int dst_ch_count = std::min(4, weights_shape.o - d * 4);
          c += GenerateConv(src_ch_count, dst_ch_count, use_dot_conv,
                            filters_counter, op_def.precision,
                            "r" + std::to_string(d), "src");
          filters_counter += use_dot_conv ? dst_ch_count : src_ch_count;
        }
        c += "  }\n";
      }
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
        c += "  }\n";
      }
    }
  }
  for (int i = 0; i < out_z; ++i) {
    std::string s_i = std::to_string(i);
    c += "  {\n";
    c += "    FLT4 res = TO_FLT4(r" + s_i + ") + args.biases.Read(" + s_i +
         ");\n";
    c += "    args.dst_tensor.Write(res, X, Y, " + s_i + ");\n";
    c += "  }\n";
  }
  c += "}\n";
  return c;
}

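// Picks the weight layout that needs fewer FLT4 constants: the dot-conv
// layout stores dst_channels * src_depth elements, while the regular layout
// stores src_channels * dst_depth (matching aligned_ch_count in
// IsConvConstantsSupported below).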
bool IsDotConvBetter(int src_channels, int dst_channels) {
  if (dst_channels % 4 == 0) {
    return false;
  }

  // dst_channels % 4 != 0
  if (src_channels % 4 == 0) {
    return true;
  }

  // dst_channels % 4 != 0 && src_channels % 4 != 0
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_depth = DivideRoundUp(dst_channels, 4);
  return dst_channels * src_depth < src_channels * dst_depth;
}

}  // namespace

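// ConvConstants is viable only when the whole filter bank fits into the
// device's __constant memory budget and the unrolled kernel needs at most
// 8 FLT4 accumulators (one per output slice); known-bad driver/GPU
// combinations are screened out as well.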
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr) {
  if (gpu_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
      definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
    // BUG, some AMD GPUs crash without this check.
    return false;
  }

  if (gpu_info.IsApiOpenCl() && gpu_info.IsAdreno()) {
    const std::string kBadDriver =
        "OpenCL 2.0 QUALCOMM build: commit #7ff4f54 changeid #I4460aa6217 "
        "Date: 12/30/18";
    if (absl::StrContains(gpu_info.opencl_info.platform_version, kBadDriver)) {
      return false;
    }
  }

  const bool use_dot_conv =
      IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
  const auto& w_shape = attr.weights.shape;
  const int src_depth = DivideRoundUp(w_shape.i, 4);
  const int dst_depth = DivideRoundUp(w_shape.o, 4);
  const int aligned_ch_count =
      use_dot_conv ? w_shape.o * src_depth * 4 : w_shape.i * dst_depth * 4;
  const int filters_count = aligned_ch_count * w_shape.h * w_shape.w;
  const int float_size = definition.precision == CalculationsPrecision::F32
                             ? sizeof(float)
                             : sizeof(half);
  const int filters_buffer_size = filters_count * float_size;
  const int kConstantMaxSize = GetOptimalMaxConstantSize(gpu_info);
  const int flt4_registers = DivideRoundUp(w_shape.o, 4);
  return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}

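// Assembles the operation: uploads the weights in the layout chosen by
// IsDotConvBetter, registers stride/dilation arguments and the negated
// prepended padding (the generated code adds padding_x/padding_y to the
// coordinate), and attaches the biases as a constant linear buffer.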
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
                                 const OperationDef& definition,
                                 const Convolution2DAttributes& attr) {
  const bool use_dot_conv =
      IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
  GPUOperation op(definition);
  UploadWeightsForConvConstants(attr.weights, definition.precision,
                                use_dot_conv, &op);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;

  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;

  op.code_ = GenerateConvolutionConstantCode(
      definition, attr.weights.shape, stride_correction, use_dot_conv, &op);
  if (definition.precision == CalculationsPrecision::F16 &&
      gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
    op.compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
  }
  if (definition.precision != CalculationsPrecision::F32 &&
      gpu_info.IsPowerVR()) {
    // BUG, some PowerVRs (GE8320) produce incorrect results without this.
    op.compiler_options_.push_back(CompilerOptions::kClDisableOptimizations);
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::BUFFER;
  desc.element_type = definition.GetDataType();
  desc.memory_type = MemoryType::CONSTANT;
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

}  // namespace gpu
}  // namespace tflite