/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/winograd.h"

#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_format.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"

namespace tflite {
namespace gpu {
namespace {
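// Generates the Winograd F(4x4, 3x3) input-transform kernel. Each work item
// covers one 4x4 output tile: it reads the corresponding 6x6 input patch
// (out-of-bounds taps are clamped and zeroed via the `mult` mask), applies
// the Bt matrix along both dimensions, and writes the 36 transformed values
// as 36 rows of dst_tensor, one column per tile.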
std::string GetKernelWinograd4x4To36() {
  std::string c;
  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  c += "__constant FLT Bt[36] = {\n";
  for (int y = 0; y < 6; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 5)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";
  c += R"(
MAIN_FUNCTION($0) {
  int X = GLOBAL_ID_0 * 4;
  int Y = GLOBAL_ID_1 * 4;
  int S = GLOBAL_ID_2;

  if (GLOBAL_ID_0 >= args.tiles_x || GLOBAL_ID_1 >= args.tiles_y) return;

  FLT4 I[6][6];
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      I[y][x] = INIT_FLT4(0.0f);
    }
  }
  int src_base = S * args.src_tensor.Height() * args.src_tensor.Width();
)";
  for (int y = 0; y < 6; ++y) {
    const std::string s_y = std::to_string(y);
    c += "  {\n";
    c += "    int coord_y = Y + " + s_y + " + args.padding_y;\n";
    c += "    bool in_y = coord_y >= 0 && coord_y < "
         "args.src_tensor.Height();\n";
    c += "    coord_y = clamp(coord_y, 0, args.src_tensor.Height() - 1);\n";
    c += "    int src_address_y = src_base + coord_y * "
         "args.src_tensor.Width();\n";
    for (int x = 0; x < 6; ++x) {
      const std::string s_x = std::to_string(x);
      c += "    {\n";
      c += "      int coord_x = X + " + s_x + " + args.padding_x;\n";
      c += "      bool in_x = coord_x >= 0 && coord_x < "
           "args.src_tensor.Width();\n";
      c += "      FLT mult = INIT_FLT(in_y && in_x);\n";
      c += "      coord_x = clamp(coord_x, 0, args.src_tensor.Width() - 1);\n";
      c += "      FLT4 src = args.src_tensor.Read(src_address_y + coord_x) * "
           "mult;\n";
      c += "      I[0][" + s_x + "] += Bt[" + std::to_string(y) + "] * src;\n";
      c += "      I[1][" + s_x + "] += Bt[" + std::to_string(y + 6) +
           "] * src;\n";
      c += "      I[2][" + s_x + "] += Bt[" + std::to_string(y + 12) +
           "] * src;\n";
      c += "      I[3][" + s_x + "] += Bt[" + std::to_string(y + 18) +
           "] * src;\n";
      c += "      I[4][" + s_x + "] += Bt[" + std::to_string(y + 24) +
           "] * src;\n";
      c += "      I[5][" + s_x + "] += Bt[" + std::to_string(y + 30) +
           "] * src;\n";
      c += "    }\n";
    }
    c += "  }\n";
  }
  c += R"(

  int dst_x = GLOBAL_ID_1 * args.tiles_x + GLOBAL_ID_0;
  args.dst_tensor.GetAddress(dst_address, dst_x, 0, S);
  for (int y = 0; y < 6; ++y) {
    FLT4 value = I[y][0] + Bt[2] * I[y][2] + Bt[4] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[7] * I[y][1] + Bt[8] * I[y][2] + Bt[9] * I[y][3] + Bt[10] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[13] * I[y][1] + Bt[14] * I[y][2] + Bt[15] * I[y][3] + Bt[16] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[19] * I[y][1] + Bt[20] * I[y][2] + Bt[21] * I[y][3] + Bt[22] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[25] * I[y][1] + Bt[26] * I[y][2] + Bt[27] * I[y][3] + Bt[28] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[31] * I[y][1] + Bt[33] * I[y][3] + I[y][5];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
  }
}
)";
  return c;
}

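// Generates the Winograd F(4x4, 3x3) output-transform kernel. Each work item
// reads the 36 transformed values of one tile (one per row of src_tensor),
// applies the At matrix along both dimensions, adds the bias, and writes the
// resulting 4x4 tile with per-pixel bounds checks on the right/bottom edges.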
std::string GetKernelWinograd36To4x4() {
  std::string c;
  auto at_mat = AtMatrixForWinograd4x4To6x6();
  c += "__constant FLT At[24] = {\n";
  for (int y = 0; y < 4; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 3)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";
  c += R"(
MAIN_FUNCTION($0) {
  int tile_id = GLOBAL_ID_0;
  int Z = GLOBAL_ID_2;
  int tiles_count_x = (args.dst_tensor.Width() + 3) / 4;
  int tile_x = (tile_id % tiles_count_x) * 4;
  int tile_y = (tile_id / tiles_count_x) * 4;
  if (tile_x >= args.dst_tensor.Width() || tile_y >= args.dst_tensor.Height()) return;

  int src_address = Z * args.src_tensor.Height() * args.src_tensor.Width() + tile_id;
  FLT4 I[4][6];
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 6; ++x) {
      I[y][x] = INIT_FLT4(0.0f);
    }
  }
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x, src_address += args.src_tensor.Width()) {
      FLT4 src = args.src_tensor.Read(src_address);
      I[0][x] += src * At[y];
      I[1][x] += src * At[y + 6];
      I[2][x] += src * At[y + 12];
      I[3][x] += src * At[y + 18];
    }
  }

  FLT4 bias_val = args.biases.Read(Z);
  for (int y = 0; y < 4 && tile_y + y < args.dst_tensor.Height(); ++y) {
    FLT4 t0 = I[y][1] + I[y][2];
    FLT4 t1 = I[y][3] + I[y][4];
    if (tile_x < args.dst_tensor.Width()) {
      FLT4 value = I[y][0] + t0 + t1 + bias_val;
      args.dst_tensor.Write(value, tile_x, tile_y + y, Z);
    }
    FLT4 t2 = I[y][1] - I[y][2];
    FLT4 t3 = I[y][3] - I[y][4];
    if (tile_x + 1 < args.dst_tensor.Width()) {
      FLT4 value = t2 * At[7] + t3 * At[9] + bias_val;
      args.dst_tensor.Write(value, tile_x + 1, tile_y + y, Z);
    }
    if (tile_x + 2 < args.dst_tensor.Width()) {
      FLT4 value = t0 * At[13] + t1 * At[15] + bias_val;
      args.dst_tensor.Write(value, tile_x + 2, tile_y + y, Z);
    }
    if (tile_x + 3 < args.dst_tensor.Width()) {
      FLT4 value = t2 * At[19] + t3 * At[21] + I[y][5] + bias_val;
      args.dst_tensor.Write(value, tile_x + 3, tile_y + y, Z);
    }
  }
}
)";
  return c;
}
}  // namespace

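// Winograd4x4To36: one work item per 4x4 tile per slice. The padded input
// extent is reduced by 2 (the 3x3 kernel halo) before the tile grid is
// computed.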
int3 Winograd4x4To36::GetGridSize() const {
  int new_width =
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
  int new_height =
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
  int tiles_x = DivideRoundUp(new_width, 4);
  int tiles_y = DivideRoundUp(new_height, 4);
  return int3(tiles_x, tiles_y, src_[0]->Slices());
}

absl::Status Winograd4x4To36::BindArguments(ArgumentsBinder* args) {
  int new_width =
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
  int new_height =
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
  int tiles_x = DivideRoundUp(new_width, 4);
  int tiles_y = DivideRoundUp(new_height, 4);
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  RETURN_IF_ERROR(args->SetInt("tiles_y", tiles_y));
  return absl::OkStatus();
}

Winograd4x4To36 CreateWinograd4x4To36(const OperationDef& definition,
                                      const Padding2D& padding) {
  Winograd4x4To36 desc(definition, padding);
  desc.code_ = GetKernelWinograd4x4To36();

  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  desc.args_.AddInt("padding_x", -padding.prepended.w);
  desc.args_.AddInt("padding_y", -padding.prepended.h);
  desc.args_.AddInt("tiles_x");
  desc.args_.AddInt("tiles_y");

  desc.work_group_size_ = int3(8, 4, 1);
  return desc;
}

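// Winograd4x4To36TileX6: input transform specialized so that each work item
// produces a single transformed row (DST_Y in [0, 6)) of one tile, with the
// Bt rows read from the linear texture uploaded by UploadBt().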
Winograd4x4To36TileX6::Winograd4x4To36TileX6(const OperationDef& definition,
                                             const Padding2D& padding,
                                             const GpuInfo& gpu_info)
    : GPUOperation(definition), padding_(padding) {
  work_group_size_ = int3(32, 1, 1);
  code_ = GetWinograd4x4To36TileX6Code(definition_, gpu_info);
  if (gpu_info.IsAdreno()) {
    compiler_options_.push_back(CompilerOptions::kAdrenoMoreWaves);
  }
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
}

std::string Winograd4x4To36TileX6::GetWinograd4x4To36TileX6Code(
    const OperationDef& op_def, const GpuInfo& gpu_info) {
  std::string c;

  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
  const bool is_image_buffer =
      src_tensor_type == TensorStorageType::IMAGE_BUFFER;
  const bool is_buffer = src_tensor_type == TensorStorageType::BUFFER;

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
    case CalculationsPrecision::F32_F16:
      c += "#define ACCUM_FLT float\n";
      break;
    case CalculationsPrecision::F16:
      c += "#define ACCUM_FLT half\n";
      break;
  }

  const DataType accum_type = op_def.precision == CalculationsPrecision::F16
                                  ? DataType::FLOAT16
                                  : DataType::FLOAT32;

  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  c += "__constant ACCUM_FLT Bt[36] = {\n";
  for (int y = 0; y < 6; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 5)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";

  std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetStateVar("ACCUM_FLT", cl_type);
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
  args_.AddInt("padding_x");
  args_.AddInt("padding_y");
  args_.AddInt("tiles_total");
  args_.AddInt("tiles_x");

  c += "MAIN_FUNCTION($0) {\n";
  c += "  int DST_X = GLOBAL_ID_0;\n";
  c += "  int DST_Y = GLOBAL_ID_1;\n";
  c += "  int DST_Z = GLOBAL_ID_2;\n";
  c += "  if (DST_X >= args.tiles_total || DST_Y >= 6 || DST_Z >= "
       "args.dst_tensor.Slices()) {\n";
  c += "    return; \n";
  c += "  }\n";
  c += "  int tile_x = (DST_X % args.tiles_x) * 4;\n";
  c += "  int tile_y = (DST_X / args.tiles_x) * 4;\n";
  c += "  ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
  c += "  ACCUM_FLT bt_ar[6];\n";
  c += "  ACCUM_FLT4 t0 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 0));\n";
  c += "  ACCUM_FLT4 t1 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 1));\n";
  c += "  DST_Y *= 6;\n";
  c += "  bt_ar[0] = t0.x;\n";
  c += "  bt_ar[1] = t0.y;\n";
  c += "  bt_ar[2] = t0.z;\n";
  c += "  bt_ar[3] = t0.w;\n";
  c += "  bt_ar[4] = t1.x;\n";
  c += "  bt_ar[5] = t1.y;\n";
  auto read_src = [&](const std::string& src, const std::string& xs) {
    if (is_image_buffer) {
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs + " + offset);\n";
    } else if (is_buffer) {
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs +
           " + offset) * m" + xs + "_x;\n";
    } else {
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_x + args.padding_x + " +
           xs + ", yc, DST_Z);\n";
    }
  };
  if (is_buffer || is_image_buffer) {
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      c += "  int xc" + xs + " = tile_x + args.padding_x + " + xs + ";\n";
      c += "  ACCUM_FLT m" + xs + "_x = TO_ACCUM_FLT(xc" + xs + " >= 0 && xc" +
           xs + " < args.src_tensor.Width());\n";
      c += "  bool inx" + xs + " = (xc" + xs + " >= 0 && xc" + xs +
           " < args.src_tensor.Width());\n";
      c += "  xc" + xs + " = clamp(xc" + xs +
           ", 0, args.src_tensor.Width() - 1);\n";
      c += "  args.src_tensor.GetAddress(src_a_" + xs + ", xc" + xs +
           ", 0, DST_Z);\n";
      if (is_image_buffer) {
        c += "  src_a_" + xs +
             " = select(-args.src_tensor.Width() * args.src_tensor.Height(), "
             "src_a_" +
             xs + ", inx" + xs + ");\n";
      }
    }
  }
  const bool manual_unroll =
      !(op_def.precision == CalculationsPrecision::F32 && gpu_info.IsMali());
  if (manual_unroll) {
    c += "  {\n";
    c += "    int yc = tile_y + args.padding_y;\n";
    if (is_buffer || is_image_buffer) {
      c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
      c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
      c += "    ACCUM_FLT bt = bt_ar[0] * TO_ACCUM_FLT(iny);\n";
    } else {
      c += "    ACCUM_FLT bt = bt_ar[0];\n";
    }
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      const std::string src = "src" + xs;
      read_src(src, xs);
      c += "    I" + xs + " = bt * " + src + ";\n";
    }
    c += "  }\n";
    for (int y = 1; y < 6; ++y) {
      const std::string ys = std::to_string(y);
      c += "  {\n";
      c += "    int yc = tile_y + args.padding_y + (" + ys + ");\n";
      if (is_buffer || is_image_buffer) {
        c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
        c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
        c += "    ACCUM_FLT bt = bt_ar[" + ys + "] * TO_ACCUM_FLT(iny);\n";
      } else {
        c += "    ACCUM_FLT bt = bt_ar[" + ys + "];\n";
      }
      for (int x = 0; x < 6; ++x) {
        const std::string xs = std::to_string(x);
        const std::string src = "src" + xs;
        read_src(src, xs);
        c += "    I" + xs + " += bt * " + src + ";\n";
      }
      c += "  }\n";
    }
  } else {
    c += "  I0 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I1 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I2 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I3 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I4 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I5 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  for (int y = 0; y < 6; ++y) {\n";
    c += "    int yc = tile_y + args.padding_y + y;\n";
    if (is_buffer || is_image_buffer) {
      c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
      c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
      c += "    ACCUM_FLT bt = bt_ar[y] * TO_ACCUM_FLT(iny);\n";
    } else {
      c += "    ACCUM_FLT bt = bt_ar[y];\n";
    }
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      const std::string src = "src" + xs;
      read_src(src, xs);
      c += "    I" + xs + " += bt * " + src + ";\n";
    }
    c += "  }\n";
  }
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(I0 + Bt[2] * I2 + Bt[4] * I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + "
       "Bt[10] * I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + "
       "Bt[16] * I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + "
       "Bt[22] * I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + "
       "Bt[28] * I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[31] * I1 + Bt[33] * I3 + I5);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "}\n";
  return c;
}

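// Uploads the 6x6 Bt matrix as a 6x8 row-padded linear texture so that each
// row can be fetched with two vec4 reads in the kernel.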
void Winograd4x4To36TileX6::UploadBt() {
  tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
  bt_aligned.shape = Linear(6 * 8);
  bt_aligned.data.resize(6 * 8);
  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      bt_aligned.data[y * 8 + x] = bt_mat[y * 6 + x];
    }
    bt_aligned.data[y * 8 + 6] = 0.0f;
    bt_aligned.data[y * 8 + 7] = 0.0f;
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition_.GetDataType();
  desc.UploadLinearData(bt_aligned);
  args_.AddObject("bt",
                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}

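// Candidate work-group sizes are tried in order; the first one that fits the
// device's reported maximum work-group size is used.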
int3 Winograd4x4To36TileX6::SelectBestWorkGroup(
    const KernelInfo& kernel_info) const {
  const std::vector<int3> wgs = {{8, 6, 4}, {8, 6, 2}, {4, 6, 2},
                                 {4, 6, 2}, {2, 6, 2}, {2, 6, 1},
                                 {1, 6, 1}, {1, 3, 1}, {1, 1, 1}};
  return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
}

absl::Status Winograd4x4To36TileX6::BindArguments(ArgumentsBinder* args) {
  const int tiles_x = DivideRoundUp(
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
  const int tiles_y = DivideRoundUp(
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
  const int tiles_total = tiles_x * tiles_y;
  RETURN_IF_ERROR(args->SetInt("padding_x", -padding_.prepended.w));
  RETURN_IF_ERROR(args->SetInt("padding_y", -padding_.prepended.h));
  RETURN_IF_ERROR(args->SetInt("tiles_total", tiles_total));
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  return absl::OkStatus();
}

int3 Winograd4x4To36TileX6::GetGridSize() const {
  const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
  const int grid_y = 6;
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

void Winograd4x4To36TileX6::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (gpu_info.IsIntel()) {
    work_groups->push_back(int3(4, 6, 1));
    return;
  }
  switch (tuning_type) {
    case TuningType::kExhaustive:
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::kFast:
    default:
      work_groups->push_back(SelectBestWorkGroup(kernel_info));
      return;
  }
}

Winograd4x4To36TileX6 CreateWinograd4x4To36TileX6(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Padding2D& padding) {
  Winograd4x4To36TileX6 result(definition, padding, gpu_info);
  result.UploadBt();
  return result;
}

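// Winograd36To4x4: the plain output transform launches one work item per tile
// column of the 36-row source tensor per slice; all bounds handling happens
// inside the generated kernel.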
int3 Winograd36To4x4::GetGridSize() const {
  return int3(src_[0]->Width(), 1, src_[0]->Slices());
}

Winograd36To4x4 CreateWinograd36To4x4(
    const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
  Winograd36To4x4 desc(definition);
  desc.code_ = GetKernelWinograd36To4x4();

  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  TensorLinearDescriptor bias_desc;
  bias_desc.storage_type = LinearStorageType::BUFFER;
  bias_desc.element_type = definition.GetDataType();
  bias_desc.UploadLinearData(biases);
  desc.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(
                                     std::move(bias_desc)));

  desc.work_group_size_ = int3(32, 1, 1);
  return desc;
}

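// Winograd36To4x4Tile4x1: output transform specialized so that each work item
// produces one row (DST_Y in [0, 4)) of a 4x4 output tile, with the At rows
// read from the linear texture uploaded by UploadAt().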
Winograd36To4x4Tile4x1::Winograd36To4x4Tile4x1(const OperationDef& definition,
                                               const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  work_group_size_ = int3(32, 1, 1);
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  code_ = GetWinograd36To4x4Tile4x1Code(definition_, gpu_info);
}

std::string Winograd36To4x4Tile4x1::GetWinograd36To4x4Tile4x1Code(
    const OperationDef& op_def, const GpuInfo& gpu_info) {
  std::string c;

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
    case CalculationsPrecision::F32_F16:
      c += "#define ACCUM_FLT float\n";
      break;
    case CalculationsPrecision::F16:
      c += "#define ACCUM_FLT half\n";
      break;
  }

  const DataType accum_type = op_def.precision == CalculationsPrecision::F16
                                  ? DataType::FLOAT16
                                  : DataType::FLOAT32;

  std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetStateVar("ACCUM_FLT", cl_type);
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
  args_.AddInt("tiles_x");

  auto at_mat = AtMatrixForWinograd4x4To6x6();
  c += "__constant ACCUM_FLT At[24] = {\n";
  for (int y = 0; y < 4; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 3)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";

  c += "MAIN_FUNCTION($0) {\n";
  c += "  int tile_id = GLOBAL_ID_0;\n";
  c += "  int DST_Y = GLOBAL_ID_1;\n";
  c += "  int DST_Z = GLOBAL_ID_2;\n";
  c += "  int tile_x = (tile_id % args.tiles_x) * 4;\n";
  c += "  int tile_y = (tile_id / args.tiles_x) * 4 + DST_Y;\n";

  c += "  if (tile_x >= args.dst_tensor.Width() || tile_y >= "
       "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Slices()) {\n";
  c += "    return; \n";
  c += "  }\n";
  c += "  ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
  c += "  ACCUM_FLT at_ar[6];\n";
  c += "  ACCUM_FLT4 t00 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 0));\n";
  c += "  ACCUM_FLT4 t01 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 1));\n";
  c += "  at_ar[0] = t00.x;\n";
  c += "  at_ar[1] = t00.y;\n";
  c += "  at_ar[2] = t00.z;\n";
  c += "  at_ar[3] = t00.w;\n";
  c += "  at_ar[4] = t01.x;\n";
  c += "  at_ar[5] = t01.y;\n";
  const bool manual_unroll =
      !(op_def.precision == CalculationsPrecision::F32 && gpu_info.IsMali());
  if (manual_unroll) {
    c += "  {\n";
    c += "    ACCUM_FLT at = at_ar[0];\n";
    for (int x = 0; x < 6; ++x) {
      const std::string yc = std::to_string(x);
      const std::string src = "src" + std::to_string(x);
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc + ", DST_Z);\n";
      c += "    I" + std::to_string(x) + " = at * " + src + ";\n";
    }
    c += "  }\n";
    for (int y = 1; y < 6; ++y) {
      c += "  {\n";
      c += "    ACCUM_FLT at = at_ar[" + std::to_string(y) + "];\n";
      for (int x = 0; x < 6; ++x) {
        const std::string yc = std::to_string(y * 6 + x);
        const std::string src = "src" + std::to_string(x);
        c += "    ACCUM_FLT4 " + src +
             " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc +
             ", DST_Z);\n";
        c += "    I" + std::to_string(x) + " += at * " + src + ";\n";
      }
      c += "  }\n";
    }
  } else {
    c += "  I0 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I1 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I2 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I3 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I4 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I5 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  for (int y = 0; y < 6; ++y) {\n";
    c += "    ACCUM_FLT at = at_ar[y];\n";
    for (int x = 0; x < 6; ++x) {
      const std::string src = "src" + std::to_string(x);
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_id, y * 6 + " +
           std::to_string(x) + ", DST_Z);\n";
      c += "    I" + std::to_string(x) + " += at * " + src + ";\n";
    }
    c += "  }\n";
  }
  c += "  ACCUM_FLT4 t0 = I1 + I2;\n";
  c += "  ACCUM_FLT4 t1 = I3 + I4;\n";
  c += "  FLT4 bias_val = args.biases.Read(DST_Z);\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(I0 + t0 + t1) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "  ACCUM_FLT4 t2 = I1 - I2;\n";
  c += "  ACCUM_FLT4 t3 = I3 - I4;\n";
  c += "  if (tile_x < args.dst_tensor.Width()) {\n";
  c += "    FLT4 r0 = TO_FLT4(t2 * At[7] + t3 * At[9]) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "  if (tile_x < args.dst_tensor.Width()) {\n";
  c += "    FLT4 r0 = TO_FLT4(t0 * At[13] + t1 * At[15]) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "  if (tile_x < args.dst_tensor.Width()) {\n";
  c += "    FLT4 r0 = TO_FLT4(t2 * At[19] + t3 * At[21] + I5) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "}\n";
  return c;
}

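// Uploads the 4x6 At matrix as a 4x8 row-padded linear texture so that each
// row can be fetched with two vec4 reads in the kernel.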
void Winograd36To4x4Tile4x1::UploadAt() {
  tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
  at_aligned.shape = Linear(4 * 8);
  at_aligned.data.resize(4 * 8);
  auto at_mat = AtMatrixForWinograd4x4To6x6();
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 6; ++x) {
      at_aligned.data[y * 8 + x] = at_mat[y * 6 + x];
    }
    at_aligned.data[y * 8 + 6] = 0.0f;
    at_aligned.data[y * 8 + 7] = 0.0f;
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition_.GetDataType();
  desc.UploadLinearData(at_aligned);
  args_.AddObject("at",
                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}

int3 Winograd36To4x4Tile4x1::SelectBestWorkGroup(
    const KernelInfo& kernel_info) const {
  const std::vector<int3> wgs = {{32, 4, 2}, {16, 4, 2}, {16, 4, 1},
                                 {8, 4, 1},  {4, 4, 1},  {2, 4, 1},
                                 {1, 4, 1},  {1, 2, 1},  {1, 1, 1}};
  return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
}

absl::Status Winograd36To4x4Tile4x1::BindArguments(ArgumentsBinder* args) {
  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  return absl::OkStatus();
}

int3 Winograd36To4x4Tile4x1::GetGridSize() const {
  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
  const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
  const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
  const int grid_y = 4;
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

void Winograd36To4x4Tile4x1::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (gpu_info.IsIntel()) {
    work_groups->push_back(int3(8, 4, 1));
    return;
  }
  switch (tuning_type) {
    case TuningType::kExhaustive:
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::kFast:
    default:
      work_groups->push_back(SelectBestWorkGroup(kernel_info));
      return;
  }
}

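// Creates the tiled 36->4x4 operation and attaches both the bias vector and
// the At matrix as linear GPU objects.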
Winograd36To4x4Tile4x1 CreateWinograd36To4x4Tile4x1(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
  Winograd36To4x4Tile4x1 result(definition, gpu_info);
  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(biases);
  result.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  result.UploadAt();
  return result;
}

}  // namespace gpu
}  // namespace tflite