/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/conv_buffer_1x1.h"

#include <array>
#include <string>
#include <utility>

#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {
namespace {

// element_size must be 1, 2 or 4
// 1 - is FLT4
// 2 - is FLT8
// 4 - is FLT16
// This function generates code for the arithmetic part of the convolution.
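// For example, with block_size = (1, 1, 1), element_size = 1 and F32/F16
// precision the emitted code looks roughly like:
//   FLT16 W0 = weights_cache[0];
//   r000 += W0.s0123 * s0.s0;
//   r000 += W0.s4567 * s0.s1;
//   r000 += W0.s89ab * s0.s2;
//   r000 += W0.scdef * s0.s3;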
std::string GetComputationPart(const int3& block_size, int element_size,
                               CalculationsPrecision precision) {
  const std::string hexes[16] = {"0", "1", "2", "3", "4", "5", "6", "7",
                                 "8", "9", "a", "b", "c", "d", "e", "f"};
  std::string c;
  for (int z = 0; z < block_size.z; ++z) {
    const std::string z_s = std::to_string(z);
    c += " FLT16 W" + z_s + " = weights_cache[" + z_s + "];\n";
    for (int y = 0; y < block_size.y; ++y) {
      for (int x = 0; x < block_size.x; ++x) {
        std::string s_index = std::to_string(y * block_size.x + x);
        for (int e = 0; e < element_size; ++e) {
          std::string r_index =
              z_s + std::to_string(y) + std::to_string(x * element_size + e);
          const std::string f0 = "W" + z_s + ".s0123";
          const std::string f1 = "W" + z_s + ".s4567";
          const std::string f2 = "W" + z_s + ".s89ab";
          const std::string f3 = "W" + z_s + ".scdef";
          switch (precision) {
            case CalculationsPrecision::F32:
            case CalculationsPrecision::F16:
              c += " r" + r_index + " += " + f0 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 0] + ";\n";
              c += " r" + r_index + " += " + f1 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 1] + ";\n";
              c += " r" + r_index + " += " + f2 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 2] + ";\n";
              c += " r" + r_index + " += " + f3 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 3] + ";\n";
              break;
            case CalculationsPrecision::F32_F16:
              c += " r" + r_index + " += convert_float4(" + f0 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 0] + " + " + f1 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 1] + " + " + f2 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 2] + " + " + f3 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 3] + ");\n";
              break;
          }
        }
      }
    }
  }
  return c;
}

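// Picks element_size and block_size for the given output shape. Non-Mali GPUs
// keep the default FLT4 elements and a 1x1x1 block; on Mali the parameters are
// tuned from the precision, the task size and the recommended block size.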
ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
                                        const OperationDef& definition,
                                        const BHWC& shape, int src_depth,
                                        int dst_depth) {
  ConvBuffer1x1::ConvParams conv_params;
  conv_params.element_size = 4;
  conv_params.block_size = int3(1, 1, 1);
  if (!gpu_info.IsMali()) {
    return conv_params;
  }
  bool can_use_flt8 = (shape.w * shape.b) % 2 == 0 &&
                      definition.precision != CalculationsPrecision::F32;
  bool is_midgard = gpu_info.IsMali() && gpu_info.mali_info.IsMidgard();
  if (is_midgard) {
    if (can_use_flt8) {
      conv_params.element_size = 8;
    }
    if (definition.precision == CalculationsPrecision::F16 || !can_use_flt8) {
      conv_params.block_size.x = 2;
    }
    return conv_params;
  }

  int task_size = shape.w * shape.b * shape.h * dst_depth;
  int block_size =
      GetRecommendedBlockSizeForConv(gpu_info, definition.precision, task_size);

  if (!can_use_flt8 && block_size > 4) {
    block_size = 4;
  }

  if (can_use_flt8 && block_size >= 2) {
    conv_params.element_size = 8;
    block_size /= 2;
  }
  if (block_size == 4) {
    conv_params.block_size.x = 2;
    if (definition.precision == CalculationsPrecision::F32 && dst_depth < 32) {
      conv_params.block_size.y = 2;
    } else {
      conv_params.block_size.z = 2;
    }
  } else if (block_size == 2) {
    if (dst_depth >= 32) {
      conv_params.block_size.z = 2;
    } else {
      conv_params.block_size.x = 2;
    }
  }

  return conv_params;
}

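// Shape-agnostic fallback: default FLT4 / 1x1x1 parameters, widened along X
// for low-core-count Mali GPUs running in F16.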
ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
                                        const OperationDef& definition,
                                        int src_depth, int dst_depth) {
  ConvBuffer1x1::ConvParams conv_params;
  conv_params.element_size = 4;
  conv_params.block_size = int3(1, 1, 1);
  if (gpu_info.IsMali() && definition.precision == CalculationsPrecision::F16 &&
      gpu_info.GetComputeUnitsCount() <= 4) {
    conv_params.block_size.x *= 2;
  }
  return conv_params;
}

}  // namespace

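// The kernel source is generated once at construction time from the selected
// conv_params; the work group size below is only a default that tuning can
// replace later.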
ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition,
                             const ConvParams& conv_params)
    : GPUOperation(definition), conv_params_(conv_params) {
  code_ = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
  work_group_size_ = int3(2, 4, 1);
}

ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation)
    : GPUOperation(std::move(operation)),
      conv_params_(std::move(operation.conv_params_)) {}

ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
  if (this != &operation) {
    std::swap(conv_params_, operation.conv_params_);
    GPUOperation::operator=(std::move(operation));
  }
  return *this;
}

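// Emits the OpenCL kernel source. Each work item computes a block of
// block_size.x * element_size x block_size.y x block_size.z output values and
// accumulates over all source slices using FLT16 weight reads.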
std::string ConvBuffer1x1::GenerateConvBuffer1x1(
    const OperationDef& op_def, const ConvBuffer1x1::ConvParams& conv_params,
    Arguments* args) {
  auto src_desc = op_def.src_tensors[0];
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  if (conv_params_.element_size == 8) {
    src_desc.SetStateVar("ElementsX2", "true");
  } else if (conv_params_.element_size == 16) {
    src_desc.SetStateVar("ElementsX4", "true");
  }
  AddSrcTensor("src_tensor", src_desc);
  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].data_type;
    desc.element_size = 16;
    desc.memory_type = MemoryType::GLOBAL;
    AddSrcBuffer("weights", desc);
  }

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  AddDstTensor("dst_tensor", dst_desc);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT8 float8\n";
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT8 half8\n";
      c += "#define FLT16 half16\n";
      break;
  }

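  // conv_params.element_size counts floats (4, 8 or 16); GetComputationPart
  // expects it in FLT4 units (1, 2 or 4).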
  const int3 block_size = conv_params.block_size;
  const int element_size = conv_params.element_size / 4;

  c += "__kernel void main_function(\n";
  c += "$0) {\n";
  c += " int X = get_global_id(0) * " +
       std::to_string(block_size.x * element_size) + ";\n";
  c += " int X_SRC = get_global_id(0) * " + std::to_string(block_size.x) +
       ";\n";
  c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
  c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
  c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
       "Z >= args.dst_tensor.Slices()) return;\n";
  if (conv_params.different_weights_for_height) {
    c += " __global FLT16* weights_cache = args.weights.GetPtr() + (Z * "
         "args.src_tensor.Height() + "
         "Y * " +
         std::to_string(block_size.z) +
         ") * "
         "args.src_tensor.Slices();\n";
  } else {
    c += " __global FLT16* weights_cache = args.weights.GetPtr() + Z * "
         "args.src_tensor.Slices();\n";
  }
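  // Every accumulator in the block starts from the bias of its output slice.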
  for (int z = 0; z < block_size.z; ++z) {
    const std::string z_s = std::to_string(z);
    c += " ACCUM_FLT4 bias_val_" + z_s +
         " = TO_ACCUM_TYPE(args.biases.Read(Z + " + z_s + "));\n";
    for (int y = 0; y < block_size.y; ++y) {
      for (int x = 0; x < block_size.x * element_size; ++x) {
        c += " ACCUM_FLT4 r" + z_s + std::to_string(y) + std::to_string(x) +
             " = bias_val_" + z_s + ";\n";
      }
    }
  }
  for (int x = 0; x < block_size.x; ++x) {
    std::string x_s = std::to_string(x);
    c += " int xc" + x_s + " = min(X_SRC + " + std::to_string(x) +
         ", args.src_tensor.Width() - 1);\n";
  }
  for (int y = 0; y < block_size.y; ++y) {
    std::string y_s = std::to_string(y);
    c += " int yc" + y_s + " = min(Y + " + y_s +
         ", args.src_tensor.Height() - 1);\n";
  }
  for (int y = 0; y < block_size.y; ++y) {
    std::string y_s = std::to_string(y);
    for (int x = 0; x < block_size.x; ++x) {
      std::string x_s = std::to_string(x);
      std::string i_s = std::to_string(y * block_size.x + x);
      c += " int src_addr_" + i_s + " = (yc" + y_s +
           ") * args.src_tensor.Width() + (xc" + x_s + ");\n";
    }
  }
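  // Main loop over source slices: read block_size.x * block_size.y inputs,
  // multiply-accumulate, then advance the source addresses by SliceStride()
  // and the weight pointer by block_size.z FLT16 values.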
  c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
  for (int y = 0; y < block_size.y; ++y) {
    std::string y_s = std::to_string(y);
    for (int x = 0; x < block_size.x; ++x) {
      std::string x_s = std::to_string(x);
      std::string i_s = std::to_string(y * block_size.x + x);
      c += " FLT" + std::to_string(element_size * 4) + " s" + i_s +
           " = args.src_tensor.Read(src_addr_" + i_s + ");\n";
    }
  }
  c += GetComputationPart(block_size, element_size, op_def.precision);
  for (int i = 0; i < block_size.x * block_size.y; ++i) {
    std::string i_s = std::to_string(i);
    c += " src_addr_" + i_s + " += args.src_tensor.SliceStride();\n";
  }
  c += " weights_cache += " + std::to_string(block_size.z) + ";\n";
  c += " }\n";  // SRC_SLICES

  for (int z = 0; z < block_size.z; ++z) {
    const std::string z_s = std::to_string(z);
    if (z != 0) {
      c += " if (Z + " + z_s + " >= args.dst_tensor.Slices()) return;\n";
    }
    for (int y = 0; y < block_size.y; ++y) {
      const std::string y_s = std::to_string(y);
      for (int x = 0; x < block_size.x * element_size; ++x) {
        const std::string x_s = std::to_string(x);
        c += " if (X + " + x_s + " < args.dst_tensor.Width() && Y + " + y_s +
             " < args.dst_tensor.Height()) {\n";
        c += " FLT4 res = TO_FLT4(r" + z_s + y_s + x_s + ");\n";
        c += " args.dst_tensor.Write(res, X + " + x_s + ", Y + " + y_s +
             ", Z + " + z_s + ");\n";
        c += " }\n";
      }
    }
  }
  c += "}\n";
  return c;
}

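// Grid X covers width * batch in groups of element_size / 4 FLT4 values; every
// axis is then divided by the corresponding block size.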
int3 ConvBuffer1x1::GetGridSize() const {
  const int dst_width_elements = DivideRoundUp(
      dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
  const int grid_x =
      DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
  const int grid_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int grid_z =
      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
  return int3(grid_x, grid_y, grid_z);
}

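// Work group candidates are delegated to the generic convolution picker.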
void ConvBuffer1x1::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
}

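// The buffer-based 1x1 kernel is applicable only to a pointwise convolution
// over BUFFER storage: 1x1 weights, unit strides and dilations, no padding.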
bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const Convolution2DAttributes& attr) {
  auto src_storage_type = definition.src_tensors[0].storage_type;
  return src_storage_type == TensorStorageType::BUFFER &&
         attr.weights.shape.w == 1 && attr.weights.shape.h == 1 &&
         attr.dilations.w == 1 && attr.dilations.h == 1 &&
         attr.strides.w == 1 && attr.strides.h == 1 &&
         attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
         attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}

bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const BHWC& weights_shape,
                              const Convolution2DAttributes& attr) {
  auto src_storage_type = definition.src_tensors[0].storage_type;
  return src_storage_type == TensorStorageType::BUFFER &&
         weights_shape.w == 1 && weights_shape.h == 1 &&
         attr.dilations.w == 1 && attr.dilations.h == 1 &&
         attr.strides.w == 1 && attr.strides.h == 1 &&
         attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
         attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}

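// Creates the operation with constant weights; parameters are tuned with the
// tensor shape when it is known.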
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
                                  const OperationDef& definition,
                                  const Convolution2DAttributes& attr,
                                  const BHWC* shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (shape) {
    conv_params =
        GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
  }
  ConvBuffer1x1 result(definition, conv_params);
  result.UploadData(attr.weights, attr.bias);
  return result;
}

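// A fully connected layer is treated as a 1x1 convolution; since its spatial
// size is 1x1, the Y block is folded into X.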
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
                                  const OperationDef& definition,
                                  const FullyConnectedAttributes& attr,
                                  const BHWC* shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (shape) {
    conv_params =
        GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
  }
  conv_params.block_size.x *= conv_params.block_size.y;
  conv_params.block_size.y = 1;
  ConvBuffer1x1 result(definition, conv_params);
  result.UploadData(attr.weights, attr.bias);
  return result;
}

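// Winograd 4x4-to-6x6 variant: every output row uses its own weights
// (different_weights_for_height), and the Y block is folded into X.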
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC* shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (shape) {
    conv_params =
        GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
  }
  conv_params.block_size.x *= conv_params.block_size.y;
  conv_params.block_size.y = 1;
  conv_params.different_weights_for_height = true;
  ConvBuffer1x1 result(definition, conv_params);
  result.UploadDataForWinograd4x4To6x6(attr.weights);
  return result;
}

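// Weights are supplied as a second runtime tensor, so only biases are uploaded
// here; parameters come from the weights shape and, when available, dst_shape.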
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(weights_shape.b, 4);
  const int src_depth = DivideRoundUp(weights_shape.c, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (dst_shape) {
    conv_params =
        GetBestParams(gpu_info, definition, *dst_shape, src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
  }
  ConvBuffer1x1 result(definition, conv_params);
  result.UploadBiases(attr.bias);
  return result;
}

}  // namespace gpu
}  // namespace tflite