/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/winograd.h"

#include <string>
#include <vector>

#include "absl/strings/str_format.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"

namespace tflite {
namespace gpu {
namespace {
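// Generates the source-transform kernel for Winograd F(4x4, 3x3): each work
// item loads a 6x6 input patch for one 4x4 output tile (out-of-bounds reads
// are clamped and zeroed via `mult`), applies the Bt transform, and writes the
// 36 transformed values along the height of the intermediate tensor at the
// column given by the tile index.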
std::string GetKernelWinograd4x4To36() {
  std::string c;
  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  c += "__constant FLT Bt[36] = {\n";
  for (int y = 0; y < 6; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f, ";
    }
    c += "\n";
  }
  c += "};\n";
  c += R"(
MAIN_FUNCTION($0) {
  int X = GLOBAL_ID_0 * 4;
  int Y = GLOBAL_ID_1 * 4;
  int S = GLOBAL_ID_2;

  if (GLOBAL_ID_0 >= args.tiles_x || GLOBAL_ID_1 >= args.tiles_y) return;

  FLT4 I[6][6];
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      I[y][x] = INIT_FLT4(0.0f);
    }
  }
  const int src_base = S * args.src_tensor.Height() * args.src_tensor.Width();
)";
  for (int y = 0; y < 6; ++y) {
    const std::string s_y = std::to_string(y);
    c += " {\n";
    c += " int coord_y = Y + " + s_y + " + args.padding_y;\n";
    c += " bool in_y = coord_y >= 0 && coord_y < "
         "args.src_tensor.Height();\n";
    c += " coord_y = clamp(coord_y, 0, args.src_tensor.Height() - 1);\n";
    c += " const int src_address_y = src_base + coord_y * "
         "args.src_tensor.Width();\n";
    for (int x = 0; x < 6; ++x) {
      const std::string s_x = std::to_string(x);
      c += " {\n";
      c += " int coord_x = X + " + s_x + " + args.padding_x;\n";
      c += " bool in_x = coord_x >= 0 && coord_x < "
           "args.src_tensor.Width();\n";
      c += " FLT mult = INIT_FLT(in_y && in_x);\n";
      c += " coord_x = clamp(coord_x, 0, args.src_tensor.Width() - 1);\n";
      c += " FLT4 src = args.src_tensor.Read(src_address_y + coord_x) * "
           "mult;\n";
      c += " I[0][" + s_x + "] += Bt[" + std::to_string(y) + "] * src;\n";
      c += " I[1][" + s_x + "] += Bt[" + std::to_string(y + 6) +
           "] * src;\n";
      c += " I[2][" + s_x + "] += Bt[" + std::to_string(y + 12) +
           "] * src;\n";
      c += " I[3][" + s_x + "] += Bt[" + std::to_string(y + 18) +
           "] * src;\n";
      c += " I[4][" + s_x + "] += Bt[" + std::to_string(y + 24) +
           "] * src;\n";
      c += " I[5][" + s_x + "] += Bt[" + std::to_string(y + 30) +
           "] * src;\n";
      c += " }\n";
    }
    c += " }\n";
  }
  c += R"(

  int dst_x = GLOBAL_ID_1 * args.tiles_x + GLOBAL_ID_0;
  args.dst_tensor.GetAddress(dst_address, dst_x, 0, S);
  for (int y = 0; y < 6; ++y) {
    FLT4 value = I[y][0] + Bt[2] * I[y][2] + Bt[4] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[7] * I[y][1] + Bt[8] * I[y][2] + Bt[9] * I[y][3] + Bt[10] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[13] * I[y][1] + Bt[14] * I[y][2] + Bt[15] * I[y][3] + Bt[16] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[19] * I[y][1] + Bt[20] * I[y][2] + Bt[21] * I[y][3] + Bt[22] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[25] * I[y][1] + Bt[26] * I[y][2] + Bt[27] * I[y][3] + Bt[28] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
    value = Bt[31] * I[y][1] + Bt[33] * I[y][3] + I[y][5];
    args.dst_tensor.WriteLinear(value, dst_address);
    dst_address += args.dst_tensor.Width();
  }
}
)";
  return c;
}

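// Generates the inverse-transform kernel: each work item reads the 36
// transformed values of one tile, applies the At transform, adds the bias, and
// writes up to a 4x4 block of the destination with per-pixel bounds checks.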
std::string GetKernelWinograd36To4x4() {
  std::string c;
  auto at_mat = AtMatrixForWinograd4x4To6x6();
  c += "__constant FLT At[24] = {\n";
  for (int y = 0; y < 4; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f, ";
    }
    c += "\n";
  }
  c += "};\n";
  c += R"(
MAIN_FUNCTION($0) {
  int tile_id = GLOBAL_ID_0;
  int Z = GLOBAL_ID_2;
  int tiles_count_x = (args.dst_tensor.Width() + 3) / 4;
  int tile_x = (tile_id % tiles_count_x) * 4;
  int tile_y = (tile_id / tiles_count_x) * 4;
  if (tile_x >= args.dst_tensor.Width() || tile_y >= args.dst_tensor.Height()) return;

  int src_address = Z * args.src_tensor.Height() * args.src_tensor.Width() + tile_id;
  FLT4 I[4][6];
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 6; ++x) {
      I[y][x] = INIT_FLT4(0.0f);
    }
  }
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x, src_address += args.src_tensor.Width()) {
      FLT4 src = args.src_tensor.Read(src_address);
      I[0][x] += src * At[y];
      I[1][x] += src * At[y + 6];
      I[2][x] += src * At[y + 12];
      I[3][x] += src * At[y + 18];
    }
  }

  FLT4 bias_val = args.biases.Read(Z);
  for (int y = 0; y < 4 && tile_y + y < args.dst_tensor.Height(); ++y) {
    FLT4 t0 = I[y][1] + I[y][2];
    FLT4 t1 = I[y][3] + I[y][4];
    if (tile_x < args.dst_tensor.Width()) {
      FLT4 value = I[y][0] + t0 + t1 + bias_val;
      args.dst_tensor.Write(value, tile_x, tile_y + y, Z);
    }
    FLT4 t2 = I[y][1] - I[y][2];
    FLT4 t3 = I[y][3] - I[y][4];
    if (tile_x + 1 < args.dst_tensor.Width()) {
      FLT4 value = t2 * At[7] + t3 * At[9] + bias_val;
      args.dst_tensor.Write(value, tile_x + 1, tile_y + y, Z);
    }
    if (tile_x + 2 < args.dst_tensor.Width()) {
      FLT4 value = t0 * At[13] + t1 * At[15] + bias_val;
      args.dst_tensor.Write(value, tile_x + 2, tile_y + y, Z);
    }
    if (tile_x + 3 < args.dst_tensor.Width()) {
      FLT4 value = t2 * At[19] + t3 * At[21] + I[y][5] + bias_val;
      args.dst_tensor.Write(value, tile_x + 3, tile_y + y, Z);
    }
  }
}
)";
  return c;
}
}  // namespace

int3 Winograd4x4To36::GetGridSize() const {
  int new_width =
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
  int new_height =
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
  int tiles_x = DivideRoundUp(new_width, 4);
  int tiles_y = DivideRoundUp(new_height, 4);
  return int3(tiles_x, tiles_y, src_[0]->Slices());
}

absl::Status Winograd4x4To36::BindArguments(ArgumentsBinder* args) {
  int new_width =
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
  int new_height =
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
  int tiles_x = DivideRoundUp(new_width, 4);
  int tiles_y = DivideRoundUp(new_height, 4);
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  RETURN_IF_ERROR(args->SetInt("tiles_y", tiles_y));
  return absl::OkStatus();
}

Winograd4x4To36 CreateWinograd4x4To36(const OperationDef& definition,
                                      const Padding2D& padding) {
  Winograd4x4To36 desc(definition, padding);
  desc.code_ = GetKernelWinograd4x4To36();

  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  desc.args_.AddInt("padding_x", -padding.prepended.w);
  desc.args_.AddInt("padding_y", -padding.prepended.h);
  desc.args_.AddInt("tiles_x");
  desc.args_.AddInt("tiles_y");

  desc.work_group_size_ = int3(8, 4, 1);
  return desc;
}

Winograd4x4To36TileX6::Winograd4x4To36TileX6(const OperationDef& definition,
                                             const Padding2D& padding,
                                             const GpuInfo& gpu_info)
    : GPUOperation(definition), padding_(padding) {
  work_group_size_ = int3(32, 1, 1);
  code_ = GetWinograd4x4To36TileX6Code(definition_);
  if (gpu_info.IsAdreno()) {
    compiler_options_.push_back(CompilerOptions::kAdrenoMoreWaves);
  }
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
  }
}

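// Tiled variant of the source transform: each work item produces a single
// transformed row (DST_Y in [0, 6)) of one tile, fetching the corresponding Bt
// row at runtime from the "bt" linear object as two FLT4 reads per row (see
// UploadBt below).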
std::string Winograd4x4To36TileX6::GetWinograd4x4To36TileX6Code(
    const OperationDef& op_def) {
  std::string c;

  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
  const bool is_image_buffer =
      src_tensor_type == TensorStorageType::IMAGE_BUFFER;
  const bool is_buffer = src_tensor_type == TensorStorageType::BUFFER;

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
    case CalculationsPrecision::F32_F16:
      c += "#define ACCUM_FLT float\n";
      break;
    case CalculationsPrecision::F16:
      c += "#define ACCUM_FLT half\n";
      break;
  }

  const DataType accum_type = op_def.precision == CalculationsPrecision::F16
                                  ? DataType::FLOAT16
                                  : DataType::FLOAT32;

  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  c += "constant ACCUM_FLT Bt[36] = {\n";
  for (int y = 0; y < 6; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f, ";
    }
    c += "\n";
  }
  c += "};\n";

  std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetStateVar("ACCUM_FLT", cl_type);
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
  args_.AddInt("padding_x");
  args_.AddInt("padding_y");
  args_.AddInt("tiles_total");
  args_.AddInt("tiles_x");

  c += "MAIN_FUNCTION($0) {\n";
  c += " int DST_X = GLOBAL_ID_0;\n";
  c += " int DST_Y = GLOBAL_ID_1;\n";
  c += " int DST_Z = GLOBAL_ID_2;\n";
  c += " if (DST_X >= args.tiles_total || DST_Y >= 6 || DST_Z >= "
       "args.dst_tensor.Slices()) {\n";
  c += " return; \n";
  c += " }\n";
  c += " int tile_x = (DST_X % args.tiles_x) * 4;\n";
  c += " int tile_y = (DST_X / args.tiles_x) * 4;\n";
  c += " ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
  c += " ACCUM_FLT bt_ar[6];\n";
  c += " ACCUM_FLT4 t0 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 0));\n";
  c += " ACCUM_FLT4 t1 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 1));\n";
  c += " DST_Y *= 6;\n";
  c += " bt_ar[0] = t0.x;\n";
  c += " bt_ar[1] = t0.y;\n";
  c += " bt_ar[2] = t0.z;\n";
  c += " bt_ar[3] = t0.w;\n";
  c += " bt_ar[4] = t1.x;\n";
  c += " bt_ar[5] = t1.y;\n";
  auto read_src = [&](const std::string& src, const std::string& xs) {
    if (is_image_buffer) {
      c += " ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs + " + offset);\n";
    } else if (is_buffer) {
      c += " ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs + " + offset) * m" +
           xs + "_x;\n";
    } else {
      c += " ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_x + args.padding_x + " +
           xs + ", yc, DST_Z);\n";
    }
  };
  if (is_buffer || is_image_buffer) {
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      c += " int xc" + xs + " = tile_x + args.padding_x + " + xs + ";\n";
      c += " ACCUM_FLT m" + xs + "_x = TO_ACCUM_FLT(xc" + xs + " >= 0 && xc" +
           xs + " < args.src_tensor.Width());\n";
      c += " bool inx" + xs + " = (xc" + xs + " >= 0 && xc" + xs +
           " < args.src_tensor.Width());\n";
      c += " xc" + xs + " = clamp(xc" + xs +
           ", 0, args.src_tensor.Width() - 1);\n";
      c += " args.src_tensor.GetAddress(src_a_" + xs + ", xc" + xs +
           ", 0, DST_Z);\n";
      if (is_image_buffer) {
        c += " src_a_" + xs +
             " = select(-args.src_tensor.Width() * args.src_tensor.Height(), "
             "src_a_" +
             xs + ", inx" + xs + ");\n";
      }
    }
  }
  c += " {\n";
  c += " int yc = tile_y + args.padding_y;\n";
  if (is_buffer || is_image_buffer) {
    c += " bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
    c += " int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
    c += " ACCUM_FLT bt = bt_ar[0] * TO_ACCUM_FLT(iny);\n";
  } else {
    c += " ACCUM_FLT bt = bt_ar[0];\n";
  }
  for (int x = 0; x < 6; ++x) {
    const std::string xs = std::to_string(x);
    const std::string src = "src" + xs;
    read_src(src, xs);
    c += " I" + xs + " = bt * " + src + ";\n";
  }
  c += " }\n";
  for (int y = 1; y < 6; ++y) {
    const std::string ys = std::to_string(y);
    c += " {\n";
    c += " int yc = tile_y + args.padding_y + (" + ys + ");\n";
    if (is_buffer || is_image_buffer) {
      c += " bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
      c += " int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
      c += " ACCUM_FLT bt = bt_ar[" + ys + "] * TO_ACCUM_FLT(iny);\n";
    } else {
      c += " ACCUM_FLT bt = bt_ar[" + ys + "];\n";
    }
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      const std::string src = "src" + xs;
      read_src(src, xs);
      c += " I" + xs + " += bt * " + src + ";\n";
    }
    c += " }\n";
  }
  c += " {\n";
  c += " FLT4 r0 = TO_FLT4(I0 + Bt[2] * I2 + Bt[4] * I4);\n";
  c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += " DST_Y++;\n";
  c += " }\n";
  c += " {\n";
  c += " FLT4 r0 = TO_FLT4(Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * "
       "I4);\n";
  c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += " DST_Y++;\n";
  c += " }\n";
  c += " {\n";
  c += " FLT4 r0 = TO_FLT4(Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] "
       "* I4);\n";
  c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += " DST_Y++;\n";
  c += " }\n";
  c += " {\n";
  c += " FLT4 r0 = TO_FLT4(Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] "
       "* I4);\n";
  c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += " DST_Y++;\n";
  c += " }\n";
  c += " {\n";
  c += " FLT4 r0 = TO_FLT4(Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] "
       "* I4);\n";
  c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += " DST_Y++;\n";
  c += " }\n";
  c += " {\n";
  c += " FLT4 r0 = TO_FLT4(Bt[31] * I1 + Bt[33] * I3 + I5);\n";
  c += " args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += " DST_Y++;\n";
  c += " }\n";
  c += "}\n";
  return c;
}

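// Uploads Bt with each 6-element row padded to 8 floats so that a row can be
// fetched in the generated kernel as exactly two FLT4 reads
// (args.bt.Read(DST_Y * 2 + 0) and args.bt.Read(DST_Y * 2 + 1)).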
void Winograd4x4To36TileX6::UploadBt() {
  tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
  bt_aligned.shape = Linear(6 * 8);
  bt_aligned.data.resize(6 * 8);
  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      bt_aligned.data[y * 8 + x] = bt_mat[y * 6 + x];
    }
    bt_aligned.data[y * 8 + 6] = 0.0f;
    bt_aligned.data[y * 8 + 7] = 0.0f;
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition_.GetDataType();
  desc.UploadLinearData(bt_aligned);
  args_.AddObject("bt",
                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}

int3 Winograd4x4To36TileX6::SelectBestWorkGroup(
    const KernelInfo& kernel_info) const {
  const std::vector<int3> wgs = {{8, 6, 4}, {8, 6, 2}, {4, 6, 2},
                                 {4, 6, 2}, {2, 6, 2}, {2, 6, 1},
                                 {1, 6, 1}, {1, 3, 1}, {1, 1, 1}};
  return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
}

absl::Status Winograd4x4To36TileX6::BindArguments(ArgumentsBinder* args) {
  const int tiles_x = DivideRoundUp(
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
  const int tiles_y = DivideRoundUp(
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
  const int tiles_total = tiles_x * tiles_y;
  RETURN_IF_ERROR(args->SetInt("padding_x", -padding_.prepended.w));
  RETURN_IF_ERROR(args->SetInt("padding_y", -padding_.prepended.h));
  RETURN_IF_ERROR(args->SetInt("tiles_total", tiles_total));
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  return absl::OkStatus();
}

int3 Winograd4x4To36TileX6::GetGridSize() const {
  const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
  const int grid_y = 6;
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

void Winograd4x4To36TileX6::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (gpu_info.IsIntel()) {
    work_groups->push_back(int3(4, 6, 1));
    return;
  }
  switch (tuning_type) {
    case TuningType::kExhaustive:
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::kFast:
    default:
      work_groups->push_back(SelectBestWorkGroup(kernel_info));
      return;
  }
}

Winograd4x4To36TileX6 CreateWinograd4x4To36TileX6(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Padding2D& padding) {
  Winograd4x4To36TileX6 result(definition, padding, gpu_info);
  result.UploadBt();
  return result;
}

int3 Winograd36To4x4::GetGridSize() const {
  return int3(src_[0]->Width(), 1, src_[0]->Slices());
}

Winograd36To4x4 CreateWinograd36To4x4(
    const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
  Winograd36To4x4 desc(definition);
  desc.code_ = GetKernelWinograd36To4x4();

  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  TensorLinearDescriptor bias_desc;
  bias_desc.storage_type = LinearStorageType::BUFFER;
  bias_desc.element_type = definition.GetDataType();
  bias_desc.UploadLinearData(biases);
  desc.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(
                                     std::move(bias_desc)));

  desc.work_group_size_ = int3(32, 1, 1);
  return desc;
}

Winograd36To4x4Tile4x1::Winograd36To4x4Tile4x1(const OperationDef& definition,
                                               const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  work_group_size_ = int3(32, 1, 1);
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
  }
  code_ = GetWinograd36To4x4Tile4x1Code(definition_);
}

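// Tiled variant of the inverse transform: each work item produces a single
// output row (DST_Y in [0, 4)) of a 4x4 tile, fetching the corresponding At
// row from the "at" linear object as two FLT4 reads per row (see UploadAt
// below).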
std::string Winograd36To4x4Tile4x1::GetWinograd36To4x4Tile4x1Code(
    const OperationDef& op_def) {
  std::string c;

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
    case CalculationsPrecision::F32_F16:
      c += "#define ACCUM_FLT float\n";
      break;
    case CalculationsPrecision::F16:
      c += "#define ACCUM_FLT half\n";
      break;
  }

  const DataType accum_type = op_def.precision == CalculationsPrecision::F16
                                  ? DataType::FLOAT16
                                  : DataType::FLOAT32;

  std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetStateVar("ACCUM_FLT", cl_type);
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
  args_.AddInt("tiles_x");

  auto at_mat = AtMatrixForWinograd4x4To6x6();
  c += "constant ACCUM_FLT At[24] = {\n";
  for (int y = 0; y < 4; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f, ";
    }
    c += "\n";
  }
  c += "};\n";

  c += "MAIN_FUNCTION($0) {\n";
  c += " int tile_id = GLOBAL_ID_0;\n";
  c += " int DST_Y = GLOBAL_ID_1;\n";
  c += " int DST_Z = GLOBAL_ID_2;\n";
  c += " int tile_x = (tile_id % args.tiles_x) * 4;\n";
  c += " int tile_y = (tile_id / args.tiles_x) * 4 + DST_Y;\n";

  c += " if (tile_x >= args.dst_tensor.Width() || tile_y >= "
       "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Slices()) {\n";
  c += " return; \n";
  c += " }\n";
  c += " ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
  c += " ACCUM_FLT at_ar[6];\n";
  c += " ACCUM_FLT4 t00 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 0));\n";
  c += " ACCUM_FLT4 t01 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 1));\n";
  c += " at_ar[0] = t00.x;\n";
  c += " at_ar[1] = t00.y;\n";
  c += " at_ar[2] = t00.z;\n";
  c += " at_ar[3] = t00.w;\n";
  c += " at_ar[4] = t01.x;\n";
  c += " at_ar[5] = t01.y;\n";
  c += " {\n";
  c += " ACCUM_FLT at = at_ar[0];\n";
  for (int x = 0; x < 6; ++x) {
    const std::string yc = std::to_string(x);
    const std::string src = "src" + std::to_string(x);
    c += " ACCUM_FLT4 " + src +
         " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc + ", DST_Z);\n";
    c += " I" + std::to_string(x) + " = at * " + src + ";\n";
  }
  c += " }\n";
  for (int y = 1; y < 6; ++y) {
    c += " {\n";
    c += " ACCUM_FLT at = at_ar[" + std::to_string(y) + "];\n";
    for (int x = 0; x < 6; ++x) {
      const std::string yc = std::to_string(y * 6 + x);
      const std::string src = "src" + std::to_string(x);
      c += " ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc + ", DST_Z);\n";
      c += " I" + std::to_string(x) + " += at * " + src + ";\n";
    }
    c += " }\n";
  }
  c += " ACCUM_FLT4 t0 = I1 + I2;\n";
  c += " ACCUM_FLT4 t1 = I3 + I4;\n";
  c += " FLT4 bias_val = args.biases.Read(DST_Z);\n";
  c += " {\n";
  c += " FLT4 r0 = TO_FLT4(I0 + t0 + t1) + bias_val;\n";
  c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += " tile_x++;\n";
  c += " }\n";
  c += " ACCUM_FLT4 t2 = I1 - I2;\n";
  c += " ACCUM_FLT4 t3 = I3 - I4;\n";
  c += " if (tile_x < args.dst_tensor.Width()) {\n";
  c += " FLT4 r0 = TO_FLT4(t2 * At[7] + t3 * At[9]) + bias_val;\n";
  c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += " tile_x++;\n";
  c += " }\n";
  c += " if (tile_x < args.dst_tensor.Width()) {\n";
  c += " FLT4 r0 = TO_FLT4(t0 * At[13] + t1 * At[15]) + bias_val;\n";
  c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += " tile_x++;\n";
  c += " }\n";
  c += " if (tile_x < args.dst_tensor.Width()) {\n";
  c += " FLT4 r0 = TO_FLT4(t2 * At[19] + t3 * At[21] + I5) + bias_val;\n";
  c += " args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += " tile_x++;\n";
  c += " }\n";
  c += "}\n";
  return c;
}

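// Uploads At with each 6-element row padded to 8 floats so that a row can be
// fetched in the generated kernel as two FLT4 reads.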
void Winograd36To4x4Tile4x1::UploadAt() {
  tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
  at_aligned.shape = Linear(4 * 8);
  at_aligned.data.resize(4 * 8);
  auto at_mat = AtMatrixForWinograd4x4To6x6();
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 6; ++x) {
      at_aligned.data[y * 8 + x] = at_mat[y * 6 + x];
    }
    at_aligned.data[y * 8 + 6] = 0.0f;
    at_aligned.data[y * 8 + 7] = 0.0f;
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition_.GetDataType();
  desc.UploadLinearData(at_aligned);
  args_.AddObject("at",
                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}

int3 Winograd36To4x4Tile4x1::SelectBestWorkGroup(
    const KernelInfo& kernel_info) const {
  const std::vector<int3> wgs = {{32, 4, 2}, {16, 4, 2}, {16, 4, 1},
                                 {8, 4, 1},  {4, 4, 1},  {2, 4, 1},
                                 {1, 4, 1},  {1, 2, 1},  {1, 1, 1}};
  return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
}

absl::Status Winograd36To4x4Tile4x1::BindArguments(ArgumentsBinder* args) {
  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  return absl::OkStatus();
}

int3 Winograd36To4x4Tile4x1::GetGridSize() const {
  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
  const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
  const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
  const int grid_y = 4;
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

void Winograd36To4x4Tile4x1::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (gpu_info.IsIntel()) {
    work_groups->push_back(int3(8, 4, 1));
    return;
  }
  switch (tuning_type) {
    case TuningType::kExhaustive:
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::kFast:
    default:
      work_groups->push_back(SelectBestWorkGroup(kernel_info));
      return;
  }
}

Winograd36To4x4Tile4x1 CreateWinograd36To4x4Tile4x1(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
  Winograd36To4x4Tile4x1 result(definition, gpu_info);
  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(biases);
  result.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  result.UploadAt();
  return result;
}

}  // namespace gpu
}  // namespace tflite