/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_4x4.h"

#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {
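// Heuristic choice of weights delivery per vendor: PowerVR prefers an async
// local-memory copy, NVIDIA and Intel a cooperative upload by the work
// group's threads, AMD constant memory; everything else reads weights
// directly from global memory.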
ConvolutionTransposed4x4::WeightsUploadType GetBestWeightsUploadType(
    const GpuInfo& gpu_info) {
  ConvolutionTransposed4x4::WeightsUploadType weights_upload_type =
      ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
  if (gpu_info.IsPowerVR()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;
  } else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS;
  } else if (gpu_info.IsAMD()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM;
  } else {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
  }
  return weights_upload_type;
}
}  // namespace

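// The generated kernel computes a 2x2 output patch per work item and assumes
// an 8x4x1 work group (the cooperative weights upload indexes local memory
// with exactly 32 threads). Apple GPUs get a permuted launch order and the
// O4I4 weights layout, which maps the inner contraction onto dot products.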
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
    const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  work_group_size_ = int3(8, 4, 1);
  if (gpu_info.IsApple()) {
    work_group_launch_order_ = int3(2, 0, 1);
  }

  if (gpu_info.IsApple()) {
    weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
  } else {
    weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
  }

  code_ = GenerateConvolutionTransposedCode(
      gpu_info, definition_, GetBestWeightsUploadType(gpu_info));
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
  }
}

std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
    const GpuInfo& gpu_info, const OperationDef& op_def,
    WeightsUploadType weights_upload_type) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  AddSrcTensor("src_tensor", src_desc);

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  AddDstTensor("dst_tensor", dst_desc);

  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].data_type;
    desc.element_size = 4;
    desc.memory_type =
        weights_upload_type ==
                ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
            ? MemoryType::CONSTANT
            : MemoryType::GLOBAL;
    AddSrcBuffer("weights", desc);
  }

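  // "filter_offset" is the stride, in FLT4 elements, between weight blocks of
  // consecutive dst slices: 64 values (16 spatial taps x 4) per src slice.
  // BindArguments() sets it to 4 * 16 * src_slices.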
  args_.AddInt("filter_offset");

  const bool need_local_mem =
      weights_upload_type ==
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      weights_upload_type ==
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;

  const int wg_total_size =
      work_group_size_.x * work_group_size_.y * work_group_size_.z;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";

  std::string c;
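  // CONV accumulates the contribution of one source pixel into one result.
  // For I4O4 weights, each of four consecutive cache entries scales one
  // component of SRC; for O4I4 the same contraction is expressed as four dot
  // products, one per output channel.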
  if (GetWeightsDescription().IsI4O4()) {
    switch (op_def.precision) {
      case CalculationsPrecision::F32:
      case CalculationsPrecision::F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += SRC.x * weights_cache[F]; \\\n";
        c += "  R += SRC.y * weights_cache[F + 1]; \\\n";
        c += "  R += SRC.z * weights_cache[F + 2]; \\\n";
        c += "  R += SRC.w * weights_cache[F + 3];\n";
        break;
      case CalculationsPrecision::F32_F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += TO_ACCUM_TYPE(SRC.x * weights_cache[F] + SRC.y * "
             "weights_cache[F + 1] + SRC.z * weights_cache[F + 2] + SRC.w * "
             "weights_cache[F + 3]);\n";
        break;
    }
  } else {
    // O4I4
    c += "#define CONV(R, SRC, F) \\\n";
    c += "  R.x += dot(SRC, weights_cache[F]); \\\n";
    c += "  R.y += dot(SRC, weights_cache[F + 1]); \\\n";
    c += "  R.z += dot(SRC, weights_cache[F + 2]); \\\n";
    c += "  R.w += dot(SRC, weights_cache[F + 3]);\n";
  }

  const std::string weights_space =
      weights_upload_type ==
              ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
          ? "__constant"
          : "__global";

  const std::string pixel_stride =
      op_def.IsBatchSupported() ? "args.dst_tensor.Batch()" : "1";
  if (gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
  }
  c += "MAIN_FUNCTION($0) {\n";
  std::string grid_coords[3];
  int3 launch_remap;
  launch_remap[work_group_launch_order_.x] = 0;
  launch_remap[work_group_launch_order_.y] = 1;
  launch_remap[work_group_launch_order_.z] = 2;
  if (work_group_launch_order_[0] == 0) {
    grid_coords[0] = "GLOBAL_ID_0";
  } else {
    grid_coords[0] = "(GROUP_ID_" + std::to_string(launch_remap[0]) +
                     " * GROUP_SIZE_0 + LOCAL_ID_0)";
  }
  if (work_group_launch_order_[1] == 1) {
    grid_coords[1] = "GLOBAL_ID_1";
  } else {
    grid_coords[1] = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
                     " * GROUP_SIZE_1 + LOCAL_ID_1)";
  }
  if (work_group_launch_order_[2] == 2) {
    grid_coords[2] = "GLOBAL_ID_2";
  } else {
    grid_coords[2] = "(GROUP_ID_" + std::to_string(launch_remap[2]) +
                     " * GROUP_SIZE_2 + LOCAL_ID_2)";
  }
  if (op_def.IsBatchSupported()) {
    c += "  int linear_id = " + grid_coords[0] + ";\n";
    c += "  int X0 = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
  }
  c += "  int X = " + grid_coords[0] + ";\n";
  c += "  int Y = " + grid_coords[1] + ";\n";
  c += "  int Z = " + grid_coords[2] + ";\n";
  if (!need_local_mem) {
    if (op_def.IsBatchSupported()) {
      c += "  if (X0 * 2 * args.dst_tensor.Batch() > args.dst_tensor.Width() "
           "|| Y * 2 > args.dst_tensor.Height() || Z "
           ">= args.dst_tensor.Slices()) return;\n";
    } else {
      c += "  if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
           "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
           "return;\n";
    }
  }
  c += "  ACCUM_FLT4 r0 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r1 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r2 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r3 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  int f_offset = Z * args.filter_offset;\n";
  if (need_local_mem) {
    c += "  __local FLT4 weights_cache[64];\n";
  }
  if (weights_upload_type ==
      ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    c += "  int local_id = LOCAL_ID_1 * 8 + LOCAL_ID_0;\n";
  }
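  // Each output 2x2 patch depends on the 2x2 source neighborhood at
  // (X - 1..X, Y - 1..Y). With batching, horizontally adjacent pixels of the
  // same image are pixel_stride apart, hence "X - pixel_stride".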
  const std::string prev_x = "X - " + pixel_stride;
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
    c += "  bool in_x0 = " + prev_x + " >= 0 && " + prev_x +
         " < args.src_tensor.Width();\n";
    c += "  bool in_x1 = X >= 0 && X < args.src_tensor.Width();\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
    c += "  bool in_y0 = Y - 1 >= 0 && Y - 1 < args.src_tensor.Height();\n";
    c += "  bool in_y1 = Y >= 0 && Y < args.src_tensor.Height();\n";
  }
  auto generate_check = [&](int x, int y) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT};
    const std::vector<std::string> names{"in_x" + std::to_string(x),
                                         "in_y" + std::to_string(y)};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  if (src_desc.IsLinear()) {
    if (src_desc.ReturnsZeroForNegOneRead()) {
      c += "  args.src_tensor.GetAddress(addr_0, " + prev_x + ", Y - 1, 0);\n";
      c += "  args.src_tensor.GetAddress(addr_1, X, Y - 1, 0);\n";
      c += "  args.src_tensor.GetAddress(addr_2, " + prev_x + ", Y, 0);\n";
      c += "  args.src_tensor.GetAddress(addr_3, X, Y, 0);\n";
      c += "  addr_0 = select(-1, addr_0, (in_x0 && in_y0));\n";
      c += "  addr_1 = select(-1, addr_1, (in_x1 && in_y0));\n";
      c += "  addr_2 = select(-1, addr_2, (in_x0 && in_y1));\n";
      c += "  addr_3 = select(-1, addr_3, (in_x1 && in_y1));\n";
      c += "  int dz_0 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y0));\n";
      c += "  int dz_1 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y0));\n";
      c += "  int dz_2 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y1));\n";
      c += "  int dz_3 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y1));\n";
    } else {
      c += "  int xc0 = clamp(" + prev_x +
           ", 0, args.src_tensor.Width() - 1);\n";
      c += "  int xc1 = clamp(X, 0, args.src_tensor.Width() - 1);\n";
      c += "  int yc0 = clamp(Y - 1, 0, args.src_tensor.Height() - 1);\n";
      c += "  int yc1 = clamp(Y, 0, args.src_tensor.Height() - 1);\n";
      c += "  args.src_tensor.GetAddress(addr_0, xc0, yc0, 0);\n";
      c += "  args.src_tensor.GetAddress(addr_1, xc1, yc0, 0);\n";
      c += "  args.src_tensor.GetAddress(addr_2, xc0, yc1, 0);\n";
      c += "  args.src_tensor.GetAddress(addr_3, xc1, yc1, 0);\n";
      c += "  int dz = args.src_tensor.SliceStride();\n";
    }
  }
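  // Emits the read of one of the four source pixels. With linear addressing
  // the precomputed address is advanced by the slice stride after each read;
  // otherwise out-of-bounds taps are zeroed via an in-bounds multiplier.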
  auto read_src = [&](int x, int y) {
    if (src_desc.IsLinear()) {
      const std::string id = std::to_string(y * 2 + x);
      const std::string addr = "addr_" + std::to_string(y * 2 + x);
      if (src_desc.ReturnsZeroForNegOneRead()) {
        return "args.src_tensor.Read(" + addr + "); " + addr + " += dz_" + id +
               ";";
      } else {
        return "args.src_tensor.Read(" + addr + ") * INIT_FLT(in_x" +
               std::to_string(x) + " && in_y" + std::to_string(y) + "); " +
               addr + " += dz;";
      }
    } else {
      std::string check = generate_check(x, y);
      if (!check.empty()) {
        check = " * INIT_FLT(" + check + ")";
      }
      return "args.src_tensor.Read(X + " + std::to_string(x - 1) + " * " +
             pixel_stride + ", Y + " + std::to_string(y - 1) + ", s)" + check +
             ";";
    }
  };
  c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
  if (need_local_mem) {
    c += "    " + barrier + ";\n";
  }
  if (weights_upload_type ==
      ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC) {
    c += "    async_work_group_copy(weights_cache, "
         "args.weights.GetPtr(f_offset), 64, 0);\n";
  } else if (weights_upload_type ==
             ConvolutionTransposed4x4::WeightsUploadType::
                 LOCAL_MEM_BY_THREADS) {
    c += "    weights_cache[local_id] = args.weights.Read(f_offset + "
         "local_id);\n";
    c += "    weights_cache[local_id + 32] = args.weights.Read(f_offset + "
         "local_id + 32);\n";
  } else {  // GLOBAL_MEM or CONSTANT_MEM
    c += "    " + weights_space +
         " FLT4* weights_cache = args.weights.GetPtr(f_offset);\n";
  }
  c += "    FLT4 src0 = " + read_src(0, 0) + ";\n";
  c += "    FLT4 src1 = " + read_src(1, 0) + ";\n";
  c += "    FLT4 src2 = " + read_src(0, 1) + ";\n";
  c += "    FLT4 src3 = " + read_src(1, 1) + ";\n";
  c += "    f_offset += 64;\n";
  if (need_local_mem) {
    c += "    " + barrier + ";\n";
  }
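  // 64 FLT4 weights per source slice: 16 vectors for each of the four source
  // pixels, split as 4 consecutive entries per result r0..r3.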
316 c += " CONV(r0, src0, 0);\n";
317 c += " CONV(r1, src0, 4);\n";
318 c += " CONV(r2, src0, 8);\n";
319 c += " CONV(r3, src0, 12);\n";
320 c += " CONV(r0, src1, 16);\n";
321 c += " CONV(r1, src1, 20);\n";
322 c += " CONV(r2, src1, 24);\n";
323 c += " CONV(r3, src1, 28);\n";
324 c += " CONV(r0, src2, 32);\n";
325 c += " CONV(r1, src2, 36);\n";
326 c += " CONV(r2, src2, 40);\n";
327 c += " CONV(r3, src2, 44);\n";
328 c += " CONV(r0, src3, 48);\n";
329 c += " CONV(r1, src3, 52);\n";
330 c += " CONV(r2, src3, 56);\n";
331 c += " CONV(r3, src3, 60);\n";
332 c += " }\n";
333 c += "\n";
334 if (need_local_mem) {
335 if (op_def.IsBatchSupported()) {
336 c += " if (X0 * 2 * args.dst_tensor.Batch() > args.dst_tensor.Width() "
337 "|| Y * 2 > args.dst_tensor.Height() || Z "
338 ">= args.dst_tensor.Slices()) return;\n";
339 } else {
340 c += " if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
341 "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
342 "return;\n";
343 }
344 }
345 if (op_def.IsBatchSupported()) {
346 c += " X = X0 * 2 * args.dst_tensor.Batch() + B - "
347 "args.dst_tensor.Batch();\n";
348 } else {
349 c += " X = X * 2 - 1;\n";
350 }
351 c += " Y = Y * 2 - 1;\n";
352 c += "\n";
353 c += " FLT4 bias_val = args.biases.Read(Z);\n";
354 c += " if (X >= 0 && Y >= 0) {\n";
355 c += " FLT4 result = TO_FLT4(r0) + bias_val;\n";
356 c += " args.dst_tensor.Write(result, X, Y, Z);\n";
357 c += " }\n";
358 c +=
359 " if (X + " + pixel_stride + " < args.dst_tensor.Width() && Y >= 0) {\n";
360 c += " FLT4 result = TO_FLT4(r1) + bias_val;\n";
361 c += " args.dst_tensor.Write(result, X + " + pixel_stride + ", Y, Z);\n";
362 c += " }\n";
363 c += " if (X >= 0 && Y + 1 < args.dst_tensor.Height()) {\n";
364 c += " FLT4 result = TO_FLT4(r2) + bias_val;\n";
365 c += " args.dst_tensor.Write(result, X, Y + 1, Z);\n";
366 c += " }\n";
367 c += " if (X + " + pixel_stride +
368 " < args.dst_tensor.Width() && Y + 1 < args.dst_tensor.Height()) {\n";
369 c += " FLT4 result = TO_FLT4(r3) + bias_val;\n";
370 c += " args.dst_tensor.Write(result, X + " + pixel_stride + ", Y+1, Z);\n";
371 c += " }\n";
372 c += "}\n";
373 return c;
374 }
375
BindArguments(ArgumentsBinder * args)376 absl::Status ConvolutionTransposed4x4::BindArguments(ArgumentsBinder* args) {
377 return args->SetInt("filter_offset", 4 * 16 * src_[0]->Slices());
378 }
379
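// Results are written at (X * 2 - 1, Y * 2 - 1)..(X * 2, Y * 2), so the grid
// is padded by one pixel per side: ceil((size + 2) / 2) work items per axis.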
int3 ConvolutionTransposed4x4::GetGridSize() const {
  const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
  const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

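// Spatial reordering of the 16 weight taps of the 4x4 kernel so that the
// generated code can walk the weights buffer sequentially; presumably derived
// from the stride-2 decomposition of the kernel into four 2x2 sub-kernels.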
std::vector<int> ConvolutionTransposed4x4::GetSpatialWeightsRemap() const {
  return std::vector<int>{10, 11, 14, 15, 8, 9, 12, 13, 2, 3, 6, 7, 0, 1, 4, 5};
}

void ConvolutionTransposed4x4::UploadWeights(
    const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
    WeightsUploadType weights_upload_type) {
  const int flt_count =
      GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);

  DataType weights_type = definition_.precision == CalculationsPrecision::F32
                              ? DataType::FLOAT32
                              : DataType::FLOAT16;

  BufferDescriptor desc;
  desc.element_type = weights_type;
  desc.element_size = 4;
  desc.memory_type =
      weights_upload_type ==
              ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
          ? MemoryType::CONSTANT
          : MemoryType::GLOBAL;
  desc.size = flt_count * SizeOf(desc.element_type);
  desc.data.resize(desc.size);

  RearrangeWeights(weights, GetWeightsDescription(), weights_type,
                   absl::MakeSpan(desc.data));
  args_.AddObject("weights",
                  absl::make_unique<BufferDescriptor>(std::move(desc)));
}

bool IsConvolutionTransposed4x4Supported(
    const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  return attr.weights.shape.w == 4 && attr.weights.shape.h == 4 &&
         attr.stride.w == 2 && attr.stride.h == 2 &&
         attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1;
}

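// Minimal usage sketch (hedged): "gpu_info", "definition" and "attr" stand in
// for objects the caller already has; the support check mirrors the predicate
// above.
//
//   if (IsConvolutionTransposed4x4Supported(definition, attr)) {
//     ConvolutionTransposed4x4 op =
//         CreateConvolutionTransposed4x4(gpu_info, definition, attr);
//     // "op" now carries the generated shader with "weights" and "biases"
//     // already attached to its arguments.
//   }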
ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  ConvolutionTransposed4x4 result(definition, gpu_info);
  result.UploadWeights(attr.weights, GetBestWeightsUploadType(gpu_info));

  TensorLinearDescriptor desc;
  desc.storage_type = gpu_info.IsApple() || !gpu_info.SupportsImages()
                          ? LinearStorageType::BUFFER
                          : LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  result.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return result;
}

ConvolutionTransposed4x4 CreateConvolutionTransposed4x4DynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  new_def.src_tensors = {
      definition.src_tensors[0]};  // keep only the src_tensor def; the
                                   // weights def is added below
  const DataType weights_type = definition.GetDataType();
  // add one src_tensor (buffer) for the weights
  new_def.src_tensors.push_back(
      {weights_type, TensorStorageType::BUFFER, Layout::HWC});

  ConvolutionTransposed4x4 result(new_def, gpu_info);

  TensorLinearDescriptor desc;
  desc.storage_type = gpu_info.IsApple() || !gpu_info.SupportsImages()
                          ? LinearStorageType::BUFFER
                          : LinearStorageType::TEXTURE_2D;
  desc.element_type = new_def.GetDataType();
  desc.UploadLinearData(attr.bias);
  result.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite
470