/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/winograd.h"

#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_format.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"

namespace tflite {
namespace gpu {
namespace {
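// Emits the kernel source for the Winograd F(4x4, 3x3) input transform: each
// 4x4 spatial tile of the source is expanded into a 6x6 (36-element) tile in
// the Winograd domain using the Bt matrix, which is baked into the kernel as
// a __constant array.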
std::string GetKernelWinograd4x4To36() {
  std::string c;
  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  c += "__constant FLT Bt[36] = {\n";
  for (int y = 0; y < 6; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 5)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";
  c += R"(
MAIN_FUNCTION($0) {
  int X = GLOBAL_ID_0 * 4;
  int Y = GLOBAL_ID_1 * 4;
  int S = GLOBAL_ID_2;

  if (GLOBAL_ID_0 >= args.tiles_x || GLOBAL_ID_1 >= args.tiles_y) return;

  FLT4 I[6][6];
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      I[y][x] = INIT_FLT4(0.0f);
    }
  }
  int src_base = S * args.src_tensor.Height() * args.src_tensor.Width();
)";
  for (int y = 0; y < 6; ++y) {
    const std::string s_y = std::to_string(y);
    c += "  {\n";
    c += "    int coord_y = Y + " + s_y + " + args.padding_y;\n";
    c += "    bool in_y = coord_y >= 0 && coord_y < "
         "args.src_tensor.Height();\n";
    c += "    coord_y = clamp(coord_y, 0, args.src_tensor.Height() - 1);\n";
    c += "    int src_adress_y = src_base + coord_y * "
         "args.src_tensor.Width();\n";
    for (int x = 0; x < 6; ++x) {
      const std::string s_x = std::to_string(x);
      c += "    {\n";
      c += "      int coord_x = X + " + s_x + " + args.padding_x;\n";
      c += "      bool in_x = coord_x >= 0 && coord_x < "
           "args.src_tensor.Width();\n";
      c += "      FLT mult = INIT_FLT(in_y && in_x);\n";
      c += "      coord_x = clamp(coord_x, 0, args.src_tensor.Width() - 1);\n";
      c += "      FLT4 src = args.src_tensor.Read(src_adress_y + coord_x) * "
           "mult;\n";
      c += "      I[0][" + s_x + "] += Bt[" + std::to_string(y) + "] * src;\n";
      c += "      I[1][" + s_x + "] += Bt[" + std::to_string(y + 6) +
           "] * src;\n";
      c += "      I[2][" + s_x + "] += Bt[" + std::to_string(y + 12) +
           "] * src;\n";
      c += "      I[3][" + s_x + "] += Bt[" + std::to_string(y + 18) +
           "] * src;\n";
      c += "      I[4][" + s_x + "] += Bt[" + std::to_string(y + 24) +
           "] * src;\n";
      c += "      I[5][" + s_x + "] += Bt[" + std::to_string(y + 30) +
           "] * src;\n";
      c += "    }\n";
    }
    c += "  }\n";
  }
  c += R"(

  int dst_x = GLOBAL_ID_1 * args.tiles_x + GLOBAL_ID_0;
  args.dst_tensor.GetAddress(dst_adress, dst_x, 0, S);
  for (int y = 0; y < 6; ++y) {
    FLT4 value = I[y][0] + Bt[2] * I[y][2] + Bt[4] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_adress);
    dst_adress += args.dst_tensor.Width();
    value = Bt[7] * I[y][1] + Bt[8] * I[y][2] + Bt[9] * I[y][3] + Bt[10] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_adress);
    dst_adress += args.dst_tensor.Width();
    value = Bt[13] * I[y][1] + Bt[14] * I[y][2] + Bt[15] * I[y][3] + Bt[16] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_adress);
    dst_adress += args.dst_tensor.Width();
    value = Bt[19] * I[y][1] + Bt[20] * I[y][2] + Bt[21] * I[y][3] + Bt[22] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_adress);
    dst_adress += args.dst_tensor.Width();
    value = Bt[25] * I[y][1] + Bt[26] * I[y][2] + Bt[27] * I[y][3] + Bt[28] * I[y][4];
    args.dst_tensor.WriteLinear(value, dst_adress);
    dst_adress += args.dst_tensor.Width();
    value = Bt[31] * I[y][1] + Bt[33] * I[y][3] + I[y][5];
    args.dst_tensor.WriteLinear(value, dst_adress);
    dst_adress += args.dst_tensor.Width();
  }
}
)";
  return c;
}

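// Emits the kernel source for the Winograd output transform: each 6x6
// transformed tile is reduced back to a 4x4 spatial tile of the destination
// via the At matrix, with the per-channel bias added on write.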
std::string GetKernelWinograd36To4x4() {
  std::string c;
  auto at_mat = AtMatrixForWinograd4x4To6x6();
  c += "__constant FLT At[24] = {\n";
  for (int y = 0; y < 4; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 3)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";
  c += R"(
MAIN_FUNCTION($0) {
  int tile_id = GLOBAL_ID_0;
  int Z = GLOBAL_ID_2;
  int tiles_count_x = (args.dst_tensor.Width() + 3) / 4;
  int tile_x = (tile_id % tiles_count_x) * 4;
  int tile_y = (tile_id / tiles_count_x) * 4;
  if (tile_x >= args.dst_tensor.Width() || tile_y >= args.dst_tensor.Height()) return;

  int src_adress = Z * args.src_tensor.Height() * args.src_tensor.Width() + tile_id;
  FLT4 I[4][6];
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 6; ++x) {
      I[y][x] = INIT_FLT4(0.0f);
    }
  }
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x, src_adress += args.src_tensor.Width()) {
      FLT4 src = args.src_tensor.Read(src_adress);
      I[0][x] += src * At[y];
      I[1][x] += src * At[y + 6];
      I[2][x] += src * At[y + 12];
      I[3][x] += src * At[y + 18];
    }
  }

  FLT4 bias_val = args.biases.Read(Z);
  for (int y = 0; y < 4 && tile_y + y < args.dst_tensor.Height(); ++y) {
    FLT4 t0 = I[y][1] + I[y][2];
    FLT4 t1 = I[y][3] + I[y][4];
    if (tile_x < args.dst_tensor.Width()) {
      FLT4 value = I[y][0] + t0 + t1 + bias_val;
      args.dst_tensor.Write(value, tile_x, tile_y + y, Z);
    }
    FLT4 t2 = I[y][1] - I[y][2];
    FLT4 t3 = I[y][3] - I[y][4];
    if (tile_x + 1 < args.dst_tensor.Width()) {
      FLT4 value = t2 * At[7] + t3 * At[9] + bias_val;
      args.dst_tensor.Write(value, tile_x + 1, tile_y + y, Z);
    }
    if (tile_x + 2 < args.dst_tensor.Width()) {
      FLT4 value = t0 * At[13] + t1 * At[15] + bias_val;
      args.dst_tensor.Write(value, tile_x + 2, tile_y + y, Z);
    }
    if (tile_x + 3 < args.dst_tensor.Width()) {
      FLT4 value = t2 * At[19] + t3 * At[21] + I[y][5] + bias_val;
      args.dst_tensor.Write(value, tile_x + 3, tile_y + y, Z);
    }
  }
}
)";
  return c;
}
}  // namespace

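// One work item per 4x4 output tile and per slice; the "- 2" in the padded
// extent reflects the 3x3 filter footprint of the Winograd F(4x4, 3x3) scheme.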
int3 Winograd4x4To36::GetGridSize() const {
  int new_width =
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
  int new_height =
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
  int tiles_x = DivideRoundUp(new_width, 4);
  int tiles_y = DivideRoundUp(new_height, 4);
  return int3(tiles_x, tiles_y, src_[0]->Slices());
}

absl::Status Winograd4x4To36::BindArguments(ArgumentsBinder* args) {
  int new_width =
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2;
  int new_height =
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2;
  int tiles_x = DivideRoundUp(new_width, 4);
  int tiles_y = DivideRoundUp(new_height, 4);
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  RETURN_IF_ERROR(args->SetInt("tiles_y", tiles_y));
  return absl::OkStatus();
}

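// Non-tiled variant: one work item computes the full 6x6 transform of a tile.
// Padding is stored negated so the kernel can add it directly to the source
// coordinates.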
Winograd4x4To36 CreateWinograd4x4To36(const OperationDef& definition,
                                      const Padding2D& padding) {
  Winograd4x4To36 desc(definition, padding);
  desc.code_ = GetKernelWinograd4x4To36();

  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  desc.args_.AddInt("padding_x", -padding.prepended.w);
  desc.args_.AddInt("padding_y", -padding.prepended.h);
  desc.args_.AddInt("tiles_x");
  desc.args_.AddInt("tiles_y");

  desc.work_group_size_ = int3(8, 4, 1);
  return desc;
}

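// TileX6 variant: each work item produces one of the six rows of a
// transformed tile for a given (tile, slice), fetching the corresponding Bt
// row from the "bt" linear texture uploaded in UploadBt().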
Winograd4x4To36TileX6::Winograd4x4To36TileX6(const OperationDef& definition,
                                             const Padding2D& padding,
                                             const GpuInfo& gpu_info)
    : GPUOperation(definition), padding_(padding) {
  work_group_size_ = int3(32, 1, 1);
  code_ = GetWinograd4x4To36TileX6Code(definition_, gpu_info);
  if (gpu_info.IsAdreno()) {
    compiler_options_.push_back(CompilerOptions::kAdrenoMoreWaves);
  }
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
}

std::string Winograd4x4To36TileX6::GetWinograd4x4To36TileX6Code(
    const OperationDef& op_def, const GpuInfo& gpu_info) {
  std::string c;

  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
  const bool is_image_buffer =
      src_tensor_type == TensorStorageType::IMAGE_BUFFER;
  const bool is_buffer = src_tensor_type == TensorStorageType::BUFFER;

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
    case CalculationsPrecision::F32_F16:
      c += "#define ACCUM_FLT float\n";
      break;
    case CalculationsPrecision::F16:
      c += "#define ACCUM_FLT half\n";
      break;
  }

  const DataType accum_type = op_def.precision == CalculationsPrecision::F16
                                  ? DataType::FLOAT16
                                  : DataType::FLOAT32;

  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  c += "__constant ACCUM_FLT Bt[36] = {\n";
  for (int y = 0; y < 6; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 5)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";

  std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetStateVar("ACCUM_FLT", cl_type);
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
  args_.AddInt("padding_x");
  args_.AddInt("padding_y");
  args_.AddInt("tiles_total");
  args_.AddInt("tiles_x");

  c += "MAIN_FUNCTION($0) {\n";
  c += "  int DST_X = GLOBAL_ID_0;\n";
  c += "  int DST_Y = GLOBAL_ID_1;\n";
  c += "  int DST_Z = GLOBAL_ID_2;\n";
  c += "  if (DST_X >= args.tiles_total || DST_Y >= 6 || DST_Z >= "
       "args.dst_tensor.Slices()) {\n";
  c += "    return; \n";
  c += "  }\n";
  c += "  int tile_x = (DST_X % args.tiles_x) * 4;\n";
  c += "  int tile_y = (DST_X / args.tiles_x) * 4;\n";
  c += "  ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
  c += "  ACCUM_FLT bt_ar[6];\n";
  c += "  ACCUM_FLT4 t0 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 0));\n";
  c += "  ACCUM_FLT4 t1 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 1));\n";
  c += "  DST_Y *= 6;\n";
  c += "  bt_ar[0] = t0.x;\n";
  c += "  bt_ar[1] = t0.y;\n";
  c += "  bt_ar[2] = t0.z;\n";
  c += "  bt_ar[3] = t0.w;\n";
  c += "  bt_ar[4] = t1.x;\n";
  c += "  bt_ar[5] = t1.y;\n";
  auto read_src = [&](const std::string& src, const std::string& xs) {
    if (is_image_buffer) {
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs + " + offset);\n";
    } else if (is_buffer) {
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(src_a_" + xs + " + offset) * m" +
           xs + "_x;\n";
    } else {
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_x + args.padding_x + " +
           xs + ", yc, DST_Z);\n";
    }
  };
  if (is_buffer || is_image_buffer) {
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      c += "  int xc" + xs + " = tile_x + args.padding_x + " + xs + ";\n";
      c += "  ACCUM_FLT m" + xs + "_x = TO_ACCUM_FLT(xc" + xs + " >= 0 && xc" +
           xs + " < args.src_tensor.Width());\n";
      c += "  bool inx" + xs + " = (xc" + xs + " >= 0 && xc" + xs +
           " < args.src_tensor.Width());\n";
      c += "  xc" + xs + " = clamp(xc" + xs +
           ", 0, args.src_tensor.Width() - 1);\n";
      c += "  args.src_tensor.GetAddress(src_a_" + xs + ", xc" + xs +
           ", 0, DST_Z);\n";
      if (is_image_buffer) {
        c += "  src_a_" + xs +
             " = select(-args.src_tensor.Width() * args.src_tensor.Height(), "
             "src_a_" +
             xs + ", inx" + xs + ");\n";
      }
    }
  }
  const bool manual_unroll =
      !(op_def.precision == CalculationsPrecision::F32 && gpu_info.IsMali());
  if (manual_unroll) {
    c += "  {\n";
    c += "    int yc = tile_y + args.padding_y;\n";
    if (is_buffer || is_image_buffer) {
      c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
      c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
      c += "    ACCUM_FLT bt = bt_ar[0] * TO_ACCUM_FLT(iny);\n";
    } else {
      c += "    ACCUM_FLT bt = bt_ar[0];\n";
    }
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      const std::string src = "src" + xs;
      read_src(src, xs);
      c += "    I" + xs + " = bt * " + src + ";\n";
    }
    c += "  }\n";
    for (int y = 1; y < 6; ++y) {
      const std::string ys = std::to_string(y);
      c += "  {\n";
      c += "    int yc = tile_y + args.padding_y + (" + ys + ");\n";
      if (is_buffer || is_image_buffer) {
        c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
        c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
        c += "    ACCUM_FLT bt = bt_ar[" + ys + "] * TO_ACCUM_FLT(iny);\n";
      } else {
        c += "    ACCUM_FLT bt = bt_ar[" + ys + "];\n";
      }
      for (int x = 0; x < 6; ++x) {
        const std::string xs = std::to_string(x);
        const std::string src = "src" + xs;
        read_src(src, xs);
        c += "    I" + xs + " += bt * " + src + ";\n";
      }
      c += "  }\n";
    }
  } else {
    c += "  I0 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I1 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I2 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I3 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I4 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I5 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  for (int y = 0; y < 6; ++y) {\n";
    c += "    int yc = tile_y + args.padding_y + y;\n";
    if (is_buffer || is_image_buffer) {
      c += "    bool iny = (yc >= 0 && yc < args.src_tensor.Height());\n";
      c += "    int offset = select(0, yc * args.src_tensor.Width(), iny);\n";
      c += "    ACCUM_FLT bt = bt_ar[y] * TO_ACCUM_FLT(iny);\n";
    } else {
      c += "    ACCUM_FLT bt = bt_ar[y];\n";
    }
    for (int x = 0; x < 6; ++x) {
      const std::string xs = std::to_string(x);
      const std::string src = "src" + xs;
      read_src(src, xs);
      c += "    I" + xs + " += bt * " + src + ";\n";
    }
    c += "  }\n";
  }
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(I0 + Bt[2] * I2 + Bt[4] * I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * "
       "I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] "
       "* I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] "
       "* I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] "
       "* I4);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(Bt[31] * I1 + Bt[33] * I3 + I5);\n";
  c += "    args.dst_tensor.Write(r0, DST_X, DST_Y, DST_Z);\n";
  c += "    DST_Y++;\n";
  c += "  }\n";
  c += "}\n";
  return c;
}

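// The 6x6 Bt matrix is stored row-major with each row zero-padded to 8 floats
// so the kernel can fetch a full row with two FLT4 reads.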
void Winograd4x4To36TileX6::UploadBt() {
  tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
  bt_aligned.shape = Linear(6 * 8);
  bt_aligned.data.resize(6 * 8);
  auto bt_mat = BtMatrixForWinograd4x4To6x6();
  for (int y = 0; y < 6; ++y) {
    for (int x = 0; x < 6; ++x) {
      bt_aligned.data[y * 8 + x] = bt_mat[y * 6 + x];
    }
    bt_aligned.data[y * 8 + 6] = 0.0f;
    bt_aligned.data[y * 8 + 7] = 0.0f;
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition_.GetDataType();
  desc.UploadLinearData(bt_aligned);
  args_.AddObject("bt",
                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}

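// Candidate work groups preserve the 6-wide Y dimension where possible; the
// first candidate that fits the device's max work group size is selected.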
int3 Winograd4x4To36TileX6::SelectBestWorkGroup(
    const KernelInfo& kernel_info) const {
  const std::vector<int3> wgs = {{8, 6, 4}, {8, 6, 2}, {4, 6, 2},
                                 {4, 6, 2}, {2, 6, 2}, {2, 6, 1},
                                 {1, 6, 1}, {1, 3, 1}, {1, 1, 1}};
  return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
}

absl::Status Winograd4x4To36TileX6::BindArguments(ArgumentsBinder* args) {
  const int tiles_x = DivideRoundUp(
      src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4);
  const int tiles_y = DivideRoundUp(
      src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4);
  const int tiles_total = tiles_x * tiles_y;
  RETURN_IF_ERROR(args->SetInt("padding_x", -padding_.prepended.w));
  RETURN_IF_ERROR(args->SetInt("padding_y", -padding_.prepended.h));
  RETURN_IF_ERROR(args->SetInt("tiles_total", tiles_total));
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  return absl::OkStatus();
}

int3 Winograd4x4To36TileX6::GetGridSize() const {
  const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
  const int grid_y = 6;
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

void Winograd4x4To36TileX6::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (gpu_info.IsIntel()) {
    work_groups->push_back(int3(4, 6, 1));
    return;
  }
  switch (tuning_type) {
    case TuningType::kExhaustive:
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::kFast:
    default:
      work_groups->push_back(SelectBestWorkGroup(kernel_info));
      return;
  }
}

Winograd4x4To36TileX6 CreateWinograd4x4To36TileX6(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Padding2D& padding) {
  Winograd4x4To36TileX6 result(definition, padding, gpu_info);
  result.UploadBt();
  return result;
}

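// Non-tiled inverse transform: the grid covers one work item per transformed
// tile (the source tensor's width) and per slice.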
int3 Winograd36To4x4::GetGridSize() const {
  return int3(src_[0]->Width(), 1, src_[0]->Slices());
}

Winograd36To4x4 CreateWinograd36To4x4(
    const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
  Winograd36To4x4 desc(definition);
  desc.code_ = GetKernelWinograd36To4x4();

  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  TensorLinearDescriptor bias_desc;
  bias_desc.storage_type = LinearStorageType::BUFFER;
  bias_desc.element_type = definition.GetDataType();
  bias_desc.UploadLinearData(biases);
  desc.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(
                                     std::move(bias_desc)));

  desc.work_group_size_ = int3(32, 1, 1);
  return desc;
}

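// Tile4x1 variant: each work item produces a single row (DST_Y in [0, 4)) of
// a 4x4 destination tile, fetching the matching At row from the "at" linear
// texture uploaded in UploadAt().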
Winograd36To4x4Tile4x1::Winograd36To4x4Tile4x1(const OperationDef& definition,
                                               const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  work_group_size_ = int3(32, 1, 1);
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  code_ = GetWinograd36To4x4Tile4x1Code(definition_, gpu_info);
}

std::string Winograd36To4x4Tile4x1::GetWinograd36To4x4Tile4x1Code(
    const OperationDef& op_def, const GpuInfo& gpu_info) {
  std::string c;

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
    case CalculationsPrecision::F32_F16:
      c += "#define ACCUM_FLT float\n";
      break;
    case CalculationsPrecision::F16:
      c += "#define ACCUM_FLT half\n";
      break;
  }

  const DataType accum_type = op_def.precision == CalculationsPrecision::F16
                                  ? DataType::FLOAT16
                                  : DataType::FLOAT32;

  std::string cl_type = accum_type == DataType::FLOAT16 ? "half" : "float";
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetStateVar("ACCUM_FLT", cl_type);
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
  args_.AddInt("tiles_x");

  auto at_mat = AtMatrixForWinograd4x4To6x6();
  c += "__constant ACCUM_FLT At[24] = {\n";
  for (int y = 0; y < 4; ++y) {
    c += "\t";
    for (int x = 0; x < 6; ++x) {
      c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f";
      if (!(x == 5 && y == 3)) {
        c += ", ";
      }
    }
    c += "\n";
  }
  c += "};\n";

  c += "MAIN_FUNCTION($0) {\n";
  c += "  int tile_id = GLOBAL_ID_0;\n";
  c += "  int DST_Y = GLOBAL_ID_1;\n";
  c += "  int DST_Z = GLOBAL_ID_2;\n";
  c += "  int tile_x = (tile_id % args.tiles_x) * 4;\n";
  c += "  int tile_y = (tile_id / args.tiles_x) * 4 + DST_Y;\n";

  c += "  if (tile_x >= args.dst_tensor.Width() || tile_y >= "
       "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Slices()) {\n";
  c += "    return; \n";
  c += "  }\n";
  c += "  ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
  c += "  ACCUM_FLT at_ar[6];\n";
  c += "  ACCUM_FLT4 t00 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 0));\n";
  c += "  ACCUM_FLT4 t01 = TO_ACCUM_TYPE(args.at.Read(DST_Y * 2 + 1));\n";
  c += "  at_ar[0] = t00.x;\n";
  c += "  at_ar[1] = t00.y;\n";
  c += "  at_ar[2] = t00.z;\n";
  c += "  at_ar[3] = t00.w;\n";
  c += "  at_ar[4] = t01.x;\n";
  c += "  at_ar[5] = t01.y;\n";
  const bool manual_unroll =
      !(op_def.precision == CalculationsPrecision::F32 && gpu_info.IsMali());
  if (manual_unroll) {
    c += "  {\n";
    c += "    ACCUM_FLT at = at_ar[0];\n";
    for (int x = 0; x < 6; ++x) {
      const std::string yc = std::to_string(x);
      const std::string src = "src" + std::to_string(x);
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc + ", DST_Z);\n";
      c += "    I" + std::to_string(x) + " = at * " + src + ";\n";
    }
    c += "  }\n";
    for (int y = 1; y < 6; ++y) {
      c += "  {\n";
      c += "    ACCUM_FLT at = at_ar[" + std::to_string(y) + "];\n";
      for (int x = 0; x < 6; ++x) {
        const std::string yc = std::to_string(y * 6 + x);
        const std::string src = "src" + std::to_string(x);
        c += "    ACCUM_FLT4 " + src +
             " = args.src_tensor.Read<ACCUM_FLT>(tile_id, " + yc +
             ", DST_Z);\n";
        c += "    I" + std::to_string(x) + " += at * " + src + ";\n";
      }
      c += "  }\n";
    }
  } else {
    c += "  I0 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I1 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I2 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I3 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I4 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  I5 = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  for (int y = 0; y < 6; ++y) {\n";
    c += "    ACCUM_FLT at = at_ar[y];\n";
    for (int x = 0; x < 6; ++x) {
      const std::string src = "src" + std::to_string(x);
      c += "    ACCUM_FLT4 " + src +
           " = args.src_tensor.Read<ACCUM_FLT>(tile_id, y * 6 + " +
           std::to_string(x) + ", DST_Z);\n";
      c += "    I" + std::to_string(x) + " += at * " + src + ";\n";
    }
    c += "  }\n";
  }
  c += "  ACCUM_FLT4 t0 = I1 + I2;\n";
  c += "  ACCUM_FLT4 t1 = I3 + I4;\n";
  c += "  FLT4 bias_val = args.biases.Read(DST_Z);\n";
  c += "  {\n";
  c += "    FLT4 r0 = TO_FLT4(I0 + t0 + t1) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "  ACCUM_FLT4 t2 = I1 - I2;\n";
  c += "  ACCUM_FLT4 t3 = I3 - I4;\n";
  c += "  if (tile_x < args.dst_tensor.Width()) {\n";
  c += "    FLT4 r0 = TO_FLT4(t2 * At[7] + t3 * At[9]) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "  if (tile_x < args.dst_tensor.Width()) {\n";
  c += "    FLT4 r0 = TO_FLT4(t0 * At[13] + t1 * At[15]) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "  if (tile_x < args.dst_tensor.Width()) {\n";
  c += "    FLT4 r0 = TO_FLT4(t2 * At[19] + t3 * At[21] + I5) + bias_val;\n";
  c += "    args.dst_tensor.Write(r0, tile_x, tile_y, DST_Z);\n";
  c += "    tile_x++;\n";
  c += "  }\n";
  c += "}\n";
  return c;
}

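// The 4x6 At matrix is stored row-major with each row zero-padded to 8 floats
// so the kernel can fetch a full row with two FLT4 reads.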
void Winograd36To4x4Tile4x1::UploadAt() {
  tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
  at_aligned.shape = Linear(4 * 8);
  at_aligned.data.resize(4 * 8);
  auto at_mat = AtMatrixForWinograd4x4To6x6();
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 6; ++x) {
      at_aligned.data[y * 8 + x] = at_mat[y * 6 + x];
    }
    at_aligned.data[y * 8 + 6] = 0.0f;
    at_aligned.data[y * 8 + 7] = 0.0f;
  }

  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition_.GetDataType();
  desc.UploadLinearData(at_aligned);
  args_.AddObject("at",
                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}

int3 Winograd36To4x4Tile4x1::SelectBestWorkGroup(
    const KernelInfo& kernel_info) const {
  const std::vector<int3> wgs = {{32, 4, 2}, {16, 4, 2}, {16, 4, 1},
                                 {8, 4, 1},  {4, 4, 1},  {2, 4, 1},
                                 {1, 4, 1},  {1, 2, 1},  {1, 1, 1}};
  return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size);
}

absl::Status Winograd36To4x4Tile4x1::BindArguments(ArgumentsBinder* args) {
  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
  RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x));
  return absl::OkStatus();
}

int3 Winograd36To4x4Tile4x1::GetGridSize() const {
  const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
  const int tiles_y = DivideRoundUp(dst_[0]->Height(), 4);
  const int grid_x = tiles_x * tiles_y * dst_[0]->Batch();
  const int grid_y = 4;
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

void Winograd36To4x4Tile4x1::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (gpu_info.IsIntel()) {
    work_groups->push_back(int3(8, 4, 1));
    return;
  }
  switch (tuning_type) {
    case TuningType::kExhaustive:
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::kFast:
    default:
      work_groups->push_back(SelectBestWorkGroup(kernel_info));
      return;
  }
}

Winograd36To4x4Tile4x1 CreateWinograd36To4x4Tile4x1(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
  Winograd36To4x4Tile4x1 result(definition, gpu_info);
  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(biases);
  result.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  result.UploadAt();
  return result;
}

}  // namespace gpu
}  // namespace tflite