• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.h"
17 
18 #include <algorithm>
19 #include <string>
20 #include <utility>
21 
22 #include "absl/strings/substitute.h"
23 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/util.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
28 
29 namespace tflite {
30 namespace gpu {
31 
32 namespace {
// Emits OpenCL code that copies `elements_to_upload` values from a global
// buffer into local memory cooperatively: each of the `total_work_items`
// threads (indexed by `lid_name`) copies one element per pass, striding by
// the work-group size. A final guarded pass handles the remainder when
// elements_to_upload is not a multiple of total_work_items.
//
// `global_offset_name` may be empty; when set, it is added to the global
// read index. Returns the generated code fragment.
std::string GenerateUploadByThreads(const std::string& local_ptr_name,
                                    const std::string& global_ptr_name,
                                    const std::string& global_offset_name,
                                    const std::string& lid_name,
                                    int total_work_items,
                                    int elements_to_upload) {
  std::string c;
  std::string offset =
      global_offset_name.empty() ? "" : global_offset_name + " + ";
  const int groups = elements_to_upload / total_work_items;
  // Fixed typo: was `reminder`.
  const int remainder = elements_to_upload % total_work_items;
  for (int i = 0; i < groups; ++i) {
    c += "    " + local_ptr_name + "[" + lid_name + " + " +
         std::to_string(total_work_items * i) + "] = " + global_ptr_name + "[" +
         offset + lid_name + " + " + std::to_string(total_work_items * i) +
         "];\n";
  }
  if (remainder != 0) {
    // Only the first `remainder` threads copy in the tail pass.
    c += "    if (" + lid_name + " < " + std::to_string(remainder) + ") {\n";
    c += "      " + local_ptr_name + "[" + lid_name + " + " +
         std::to_string(total_work_items * groups) + "] = " + global_ptr_name +
         "[" + offset + lid_name + " + " +
         std::to_string(total_work_items * groups) + "];\n";
    c += "    }\n";
  }
  return c;
}
60 
// Emits a single async_work_group_copy() that uploads `elements_to_upload`
// values from the global pointer (optionally displaced by
// `global_offset_name`) into the local pointer. Returns the code fragment.
std::string GenerateAsyncUpload(const std::string& local_ptr_name,
                                const std::string& global_ptr_name,
                                const std::string& global_offset_name,
                                int elements_to_upload) {
  std::string src_suffix;
  if (!global_offset_name.empty()) {
    src_suffix = " + " + global_offset_name;
  }
  return "    async_work_group_copy(" + local_ptr_name + ", " +
         global_ptr_name + src_suffix + ", " +
         std::to_string(elements_to_upload) + ", 0);\n";
}
72 
// Emits code computing the destination coordinates DST_X, DST_Y, DST_S (and
// DST_Z when `need_depth`) of the output block handled by the current work
// item. Each coordinate is the global id along its axis scaled by the
// corresponding component of `block_size` (a work item computes a whole
// block of outputs). Three dispatch layouts are supported:
//   - linear_all:     1D grid; slice index then spatial are decoded from a
//                     single linear id via args.task_size_* divisions.
//   - linear_spatial: dim 0 carries the linearized spatial extent, dim 1
//                     carries slices.
//   - otherwise:      x on dim 0, y (and z, interleaved) on dim 1, slices
//                     on dim 2.
// When the launch order is permuted, GLOBAL_ID_n cannot be used directly and
// the id is rebuilt as GROUP_ID_remapped * GROUP_SIZE_n + LOCAL_ID_n.
std::string GenerateBlockCoords(const int4& block_size,
                                const int3& work_group_launch_order,
                                bool linear_spatial, bool linear_all,
                                bool need_depth) {
  std::string c;
  // launch_remap is the inverse permutation of work_group_launch_order:
  // launch_remap[work_group_launch_order[i]] == i.
  int3 launch_remap;
  launch_remap[work_group_launch_order.x] = 0;
  launch_remap[work_group_launch_order.y] = 1;
  launch_remap[work_group_launch_order.z] = 2;
  if (linear_all) {
    // Whole task flattened into one dimension: peel off the slice index
    // first, then decode the spatial part.
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int DST_S = (linear_id / args.task_size_spatial) * " +
         std::to_string(block_size.w) + ";\n";
    c += "  int linear_spatial = linear_id % args.task_size_spatial;\n";
    if (need_depth) {
      // Spatial decode order: x, then y, then z.
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += "  linear_spatial = linear_spatial / args.task_size_x;\n";
      c += "  int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += "  int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
  } else if (linear_spatial) {
    // Dim 0: linearized spatial; dim 1: slices.
    if (work_group_launch_order[0] == 0) {
      c += "  int linear_spatial = GLOBAL_ID_0;\n";
    } else {
      c += "  int linear_spatial = GROUP_ID_" +
           std::to_string(launch_remap[0]) + " * GROUP_SIZE_0 + LOCAL_ID_0;\n";
    }
    if (need_depth) {
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += "  linear_spatial = linear_spatial / args.task_size_x;\n";
      c += "  int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += "  int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
    if (work_group_launch_order[1] == 1) {
      c +=
          "  int DST_S = GLOBAL_ID_1 * " + std::to_string(block_size.w) + ";\n";
    } else {
      c += "  int DST_S = (GROUP_ID_" + std::to_string(launch_remap[1]) +
           " * GROUP_SIZE_1 + LOCAL_ID_1) * " + std::to_string(block_size.w) +
           ";\n";
    }
  } else {
    // Fully separate dims: x on 0, y (and z) on 1, slices on 2.
    if (work_group_launch_order[0] == 0) {
      c +=
          "  int DST_X = GLOBAL_ID_0 * " + std::to_string(block_size.x) + ";\n";
    } else {
      c += "  int DST_X = (GROUP_ID_" + std::to_string(launch_remap[0]) +
           " * GROUP_SIZE_0 + LOCAL_ID_0) * " + std::to_string(block_size.x) +
           ";\n";
    }
    std::string global_id_1;
    if (work_group_launch_order[1] == 1) {
      global_id_1 = "GLOBAL_ID_1";
    } else {
      global_id_1 = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
                    " * GROUP_SIZE_1 + LOCAL_ID_1)";
    }
    if (need_depth) {
      // Dim 1 carries y and z interleaved: z = id / task_size_y, y = id %.
      c += "  int linear_id_1 = " + global_id_1 + ";\n";
      c += "  int DST_Z = (linear_id_1 / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
      c += "  int DST_Y = (linear_id_1 % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
    } else {
      c += "  int DST_Y = " + global_id_1 + " * " +
           std::to_string(block_size.y) + ";\n";
    }
    if (work_group_launch_order[2] == 2) {
      c +=
          "  int DST_S = GLOBAL_ID_2 * " + std::to_string(block_size.w) + ";\n";
    } else {
      c += "  int DST_S = (GROUP_ID_" + std::to_string(launch_remap[2]) +
           " * GROUP_SIZE_2 + LOCAL_ID_2) * " + std::to_string(block_size.w) +
           ";\n";
    }
  }

  return c;
}
168 }  // namespace
169 
// 2D convolution. Tuning parameters (block size, weights upload strategy,
// etc.) are guessed from the GPU and the optional destination shape.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      // Padding is stored negated; generated code adds it to src coords.
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
179 
// 2D convolution with runtime (dynamic) weights: the kernel spatial size
// comes from `weights_shape` rather than from attr.weights.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const BHWC& weights_shape, const GpuInfo& gpu_info,
                         const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      // Padding is stored negated; generated code adds it to src coords.
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, weights_shape,
                                   dst_shape)) {}
191 
// Fully connected layer expressed as a 1x1 convolution with unit stride,
// no padding and no dilation.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const FullyConnectedAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
201 
// Bare-bones constructor: trivial geometry, conv_params_ left default —
// presumably filled in later by the caller (TODO confirm with call sites).
ConvPowerVR::ConvPowerVR(const OperationDef& definition)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1) {}
208 
// Move constructor: moves the GPUOperation base; the small value members
// (int4 vectors, conv params) are simply copied.
ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
    : GPUOperation(std::move(operation)),
      stride_(operation.stride_),
      padding_(operation.padding_),
      kernel_size_(operation.kernel_size_),
      dilation_(operation.dilation_),
      conv_params_(operation.conv_params_) {}
216 
// 3D convolution: same as the 2D case but with the depth components of
// stride / padding / kernel size / dilation populated.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution3DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWDC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1),
      // Padding is stored negated; generated code adds it to src coords.
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
               -attr.padding.prepended.d, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
                   attr.weights.shape.d, 1),
      dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
228 
// Move assignment. Members are exchanged via swap (cheap for these value
// types), then the base-class state is moved; self-assignment is a no-op.
ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
  if (this != &operation) {
    std::swap(stride_, operation.stride_);
    std::swap(padding_, operation.padding_);
    std::swap(kernel_size_, operation.kernel_size_);
    std::swap(dilation_, operation.dilation_);
    std::swap(conv_params_, operation.conv_params_);
    GPUOperation::operator=(std::move(operation));
  }
  return *this;
}
240 
GenerateCode(const GpuInfo & gpu_info)241 void ConvPowerVR::GenerateCode(const GpuInfo& gpu_info) {
242   if (conv_params_.linear_all) {
243     grid_dimension_ = 1;
244   } else if (conv_params_.linear_spatial) {
245     grid_dimension_ = 2;
246   }
247   const bool stride_correction =
248       definition_.IsBatchSupported() && stride_.x != 1;
249   code_ = GenerateConv(gpu_info, definition_, stride_correction, conv_params_);
250   if (definition_.precision == CalculationsPrecision::F16 &&
251       gpu_info.IsPowerVR()) {
252     compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
253   }
254   if (conv_params_.IsPrivateMemBroadcast() && gpu_info.IsCL20OrHigher()) {
255     compiler_options_.push_back(CompilerOptions::kCl20);
256   }
257   bool kernel_is_trivial =
258       conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
259   if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
260     kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
261   }
262   if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx() &&
263       definition_.precision == CalculationsPrecision::F16 &&
264       kernel_is_trivial) {
265     compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
266   }
267 }
268 
BindArguments(ArgumentsBinder * args)269 absl::Status ConvPowerVR::BindArguments(ArgumentsBinder* args) {
270   if (!conv_params_.x_kernel_is_1) {
271     RETURN_IF_ERROR(args->SetInt("stride_x", stride_.x));
272     RETURN_IF_ERROR(args->SetInt("padding_x", padding_.x * src_[0]->Batch()));
273     RETURN_IF_ERROR(args->SetInt("kernel_size_x", kernel_size_.x));
274     RETURN_IF_ERROR(args->SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
275   }
276   if (!conv_params_.y_kernel_is_1) {
277     RETURN_IF_ERROR(args->SetInt("stride_y", stride_.y));
278     RETURN_IF_ERROR(args->SetInt("padding_y", padding_.y));
279     RETURN_IF_ERROR(args->SetInt("kernel_size_y", kernel_size_.y));
280     RETURN_IF_ERROR(args->SetInt("dilation_y", dilation_.y));
281   }
282   if (definition_.src_tensors[0].HasAxis(Axis::DEPTH) &&
283       !conv_params_.z_kernel_is_1) {
284     RETURN_IF_ERROR(args->SetInt("stride_z", stride_.z));
285     RETURN_IF_ERROR(args->SetInt("padding_z", padding_.z));
286     RETURN_IF_ERROR(args->SetInt("kernel_size_z", kernel_size_.z));
287     RETURN_IF_ERROR(args->SetInt("dilation_z", dilation_.z));
288   }
289   const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
290                                         conv_params_.block_size.x);
291   const int task_size_y =
292       DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
293   const int task_size_z =
294       DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
295   RETURN_IF_ERROR(args->SetInt("task_size_x", task_size_x));
296   RETURN_IF_ERROR(args->SetInt("task_size_y", task_size_y));
297   const int task_size_spatial = task_size_x * task_size_y * task_size_z;
298   RETURN_IF_ERROR(args->SetInt("task_size_spatial", task_size_spatial));
299   return absl::OkStatus();
300 }
301 
GetGridSize() const302 int3 ConvPowerVR::GetGridSize() const {
303   const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
304                                         conv_params_.block_size.x);
305   const int task_size_y =
306       DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
307   const int task_size_z =
308       DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
309   const int task_size_s =
310       DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w);
311   int3 wg;
312 
313   if (conv_params_.linear_all) {
314     return int3(task_size_x * task_size_y * task_size_z * task_size_s, 1, 1);
315   } else if (conv_params_.linear_spatial) {
316     return int3(task_size_x * task_size_y * task_size_z, task_size_s, 1);
317   } else {
318     return int3(task_size_x, task_size_y * task_size_z, task_size_s);
319   }
320 }
321 
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups) const322 void ConvPowerVR::GetPossibleKernelWorkGroups(
323     TuningType tuning_type, const GpuInfo& gpu_info,
324     const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
325   if (conv_params_.weights_upload_type ==
326           WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
327       conv_params_.weights_upload_type ==
328           WeightsUploadType::LOCAL_MEM_BY_THREADS ||
329       conv_params_.fixed_work_group_size) {
330     work_groups->push_back(work_group_size_);
331     return;
332   }
333   GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
334                             work_groups);
335 }
336 
GenerateConv(const GpuInfo & gpu_info,const OperationDef & op_def,bool stride_correction,const ConvParams & conv_params)337 std::string ConvPowerVR::GenerateConv(const GpuInfo& gpu_info,
338                                       const OperationDef& op_def,
339                                       bool stride_correction,
340                                       const ConvParams& conv_params) {
341   auto src_desc = op_def.src_tensors[0];
342   src_desc.SetAddressMode(AddressMode::kZero);
343   if (op_def.IsBatchSupported()) {
344     src_desc.SetStateVar("BatchedWidth", "true");
345   }
346   AddSrcTensor("src_tensor", src_desc);
347   if (op_def.src_tensors.size() == 2) {
348     // dynamic weights
349     BufferDescriptor desc;
350     desc.element_type = op_def.src_tensors[1].data_type;
351     desc.element_size = 4;
352     desc.memory_type = conv_params.weights_upload_type ==
353                                ConvPowerVR::WeightsUploadType::CONSTANT_MEM
354                            ? MemoryType::CONSTANT
355                            : MemoryType::GLOBAL;
356 
357     AddSrcBuffer("weights", desc);
358   }
359 
360   const auto& src_def = op_def.src_tensors[0];
361 
362   auto generate_id = [&](const std::string& x, const std::string& y,
363                          const std::string& z) {
364     std::string id;
365     if (src_def.HasAxis(Axis::WIDTH)) {
366       id += "_w" + x;
367     }
368     if (src_def.HasAxis(Axis::HEIGHT)) {
369       id += "_h" + y;
370     }
371     if (src_def.HasAxis(Axis::DEPTH)) {
372       id += "_d" + z;
373     }
374     return id;
375   };
376 
377   auto generate_id_full = [&](const std::string& x, const std::string& y,
378                               const std::string& z, const std::string& s) {
379     return generate_id(x, y, z) + "_s" + s;
380   };
381 
382   auto generate_check = [&](const std::string& x, const std::string& y,
383                             const std::string& z) {
384     std::string check;
385     const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
386     const std::vector<std::string> names{"in_x", "in_y", "in_z"};
387     const std::vector<bool> is_1{conv_params_.x_kernel_is_1,
388                                  conv_params_.y_kernel_is_1,
389                                  conv_params_.z_kernel_is_1};
390     const std::vector<std::string> coords{x, y, z};
391     for (int i = 0; i < axes.size(); ++i) {
392       const auto& axis = axes[i];
393       if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) &&
394           !is_1[i]) {
395         if (!check.empty()) {
396           check += " && ";
397         }
398         check += names[i] + coords[i];
399       }
400     }
401     return check;
402   };
403 
404   auto dst_desc = op_def.dst_tensors[0];
405   if (op_def.IsBatchSupported()) {
406     dst_desc.SetStateVar("BatchedWidth", "true");
407   }
408   AddDstTensor("dst_tensor", dst_desc);
409 
410   if (!conv_params_.x_kernel_is_1) {
411     args_.AddInt("stride_x");
412     args_.AddInt("padding_x");
413     args_.AddInt("kernel_size_x");
414     args_.AddInt("dilation_x");
415   }
416   if (!conv_params_.y_kernel_is_1) {
417     args_.AddInt("stride_y");
418     args_.AddInt("padding_y");
419     args_.AddInt("kernel_size_y");
420     args_.AddInt("dilation_y");
421   }
422   if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
423     args_.AddInt("stride_z");
424     args_.AddInt("padding_z");
425     args_.AddInt("kernel_size_z");
426     args_.AddInt("dilation_z");
427   }
428   args_.AddInt("task_size_x");
429   args_.AddInt("task_size_y");
430   args_.AddInt("task_size_spatial");
431 
432   const int wg_total_size =
433       work_group_size_.x * work_group_size_.y * work_group_size_.z;
434   const std::string barrier =
435       wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
436           ? "SIMD_LOCAL_MEM_BARRIER"
437           : "LOCAL_MEM_BARRIER";
438 
439   const bool need_local_mem =
440       conv_params.weights_upload_type ==
441           ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
442       conv_params.weights_upload_type ==
443           ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
444 
445   const int local_mem_size =
446       conv_params.block_size.w * 4 * conv_params.src_depth_loop_size;
447 
448   const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast();
449   const int simd_size = conv_params.simd_size;
450 
451   const bool late_oob_check = need_local_mem || use_simd_broadcast;
452 
453   const std::string weights_space =
454       conv_params.weights_upload_type ==
455               ConvPowerVR::WeightsUploadType::CONSTANT_MEM
456           ? "__constant"
457           : "__global";
458 
459   const std::string weights_data_type =
460       conv_params.weights_data_type == DataType::FLOAT32 ? "float4" : "half4";
461 
462   const std::string weights_global_ptr =
463       weights_space + " " + weights_data_type + "*";
464 
465   std::string c;
466   if (use_simd_broadcast && gpu_info.IsApiOpenCl()) {
467     if (gpu_info.opencl_info.cl_version == OpenClVersion::kCl2_0) {
468       c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
469     } else if (gpu_info.SupportsExtension("cl_intel_subgroups")) {
470       c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n";
471     }
472   }
473   const int4 block_size = conv_params.block_size;
474   if (conv_params.fixed_work_group_size && gpu_info.IsApiOpenCl()) {
475     c += "__attribute__((reqd_work_group_size(" +
476          std::to_string(work_group_size_.x) + ", " +
477          std::to_string(work_group_size_.y) + ", " +
478          std::to_string(work_group_size_.z) + ")))\n";
479   }
480   if (use_simd_broadcast && gpu_info.IsIntel() && gpu_info.IsApiOpenCl()) {
481     c += "__attribute__((intel_reqd_sub_group_size(" +
482          std::to_string(simd_size) + ")))\n";
483   }
484   std::string dst_oob_check;
485   if (src_def.HasAxis(Axis::DEPTH)) {
486     if (conv_params.linear_all) {
487       dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
488     } else if (conv_params.linear_spatial) {
489       dst_oob_check =
490           "DST_Z >= args.dst_tensor.Depth() || DST_S >= "
491           "args.dst_tensor.Slices()";
492     } else {
493       dst_oob_check =
494           "DST_X >= args.dst_tensor.Width() || DST_Z >= "
495           "args.dst_tensor.Depth() || DST_S >= args.dst_tensor.Slices()";
496     }
497   } else {
498     if (conv_params.linear_all) {
499       dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
500     } else if (conv_params.linear_spatial) {
501       dst_oob_check =
502           "DST_Y >= args.dst_tensor.Height() || DST_S >= "
503           "args.dst_tensor.Slices()";
504     } else {
505       dst_oob_check =
506           "DST_X >= args.dst_tensor.Width() || DST_Y >= "
507           "args.dst_tensor.Height() || DST_S >= args.dst_tensor.Slices()";
508     }
509   }
510   c += "MAIN_FUNCTION($0) {\n";
511   c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
512                            conv_params.linear_spatial, conv_params.linear_all,
513                            src_def.HasAxis(Axis::DEPTH));
514   if (!late_oob_check) {
515     c += "  if (" + dst_oob_check + ") {\n";
516     c += "    return;\n";
517     c += "  }\n";
518   }
519   if (conv_params.weights_upload_type ==
520       ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
521     if (conv_params.linear_spatial) {
522       c += "  int lid = LOCAL_ID_0;\n";
523     } else {
524       c += "  int lid = LOCAL_ID_1 * " + std::to_string(work_group_size_.x) +
525            " + LOCAL_ID_0;\n";
526     }
527   }
528   if (use_simd_broadcast) {
529     c += "  int simd_id = SUB_GROUP_LOCAL_ID;\n";
530   }
531   for (int s = 0; s < block_size.w; ++s) {
532     const std::string sind = std::to_string(s);
533     for (int z = 0; z < block_size.z; ++z) {
534       const std::string zind = std::to_string(z);
535       for (int y = 0; y < block_size.y; ++y) {
536         const std::string yind = std::to_string(y);
537         for (int x = 0; x < block_size.x; ++x) {
538           const std::string xind = std::to_string(x);
539           c += "  ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
540                " = INIT_ACCUM_FLT4(0.0f);\n";
541         }
542       }
543     }
544   }
545   if (!conv_params_.x_kernel_is_1) {
546     for (int x = 0; x < block_size.x; ++x) {
547       const std::string xind = std::to_string(x);
548       const std::string xc = "(DST_X + " + xind + ")";
549       if (stride_correction) {
550         c += "  int xc" + xind + " = " +
551              GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x",
552                                  "args.padding_x") +
553              ";\n";
554       } else {
555         c += "  int xc" + xind + " = " + xc +
556              " * args.stride_x + args.padding_x;\n";
557       }
558     }
559   } else {
560     for (int x = 0; x < block_size.x; ++x) {
561       const std::string xind = std::to_string(x);
562       c += "  int xc" + xind + " = DST_X + " + xind + ";\n";
563       if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
564         c += "  xc" + xind + " = clamp(xc" + xind +
565              ", 0, args.src_tensor.Width() - 1);\n";
566       }
567     }
568   }
569   if (!conv_params_.y_kernel_is_1) {
570     for (int y = 0; y < block_size.y; ++y) {
571       const std::string yind = std::to_string(y);
572       const std::string yc = "(DST_Y + " + yind + ")";
573       c += "  int yc" + yind + " = " + yc +
574            " * args.stride_y + args.padding_y;\n";
575     }
576   } else {
577     for (int y = 0; y < block_size.y; ++y) {
578       const std::string yind = std::to_string(y);
579       c += "  int yc" + yind + " = DST_Y + " + yind + ";\n";
580       if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
581         c += "  yc" + yind + " = clamp(yc" + yind +
582              ", 0, args.src_tensor.Height() - 1);\n";
583       }
584     }
585   }
586   if (src_def.HasAxis(Axis::DEPTH)) {
587     if (!conv_params_.z_kernel_is_1) {
588       for (int z = 0; z < block_size.z; ++z) {
589         const std::string zind = std::to_string(z);
590         const std::string zc = "(DST_Z + " + zind + ")";
591         c += "  int zc" + zind + " = " + zc +
592              " * args.stride_z + args.padding_z;\n";
593       }
594     } else {
595       for (int z = 0; z < block_size.z; ++z) {
596         const std::string zind = std::to_string(z);
597         c += "  int zc" + zind + " = DST_Z + " + zind + ";\n";
598         if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
599           c += "  zc" + zind + " = clamp(zc" + zind +
600                ", 0, args.src_tensor.Depth() - 1);\n";
601         }
602       }
603     }
604   }
605   bool trivial_kernel_size =
606       conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
607   if (src_def.HasAxis(Axis::DEPTH)) {
608     trivial_kernel_size = trivial_kernel_size && conv_params_.z_kernel_is_1;
609   }
610   if (need_local_mem) {
611     c += "  __local " + weights_data_type + " weights_cache[" +
612          std::to_string(local_mem_size) + "];\n";
613   } else if (conv_params.AreWeightsBuffer()) {
614     c += "    " + weights_global_ptr + " weights_cache;\n";
615   } else if (!trivial_kernel_size) {
616     c += "  int filter_offset = 0;\n";
617   }
618   if (conv_params.AreWeightsBuffer()) {
619     if (conv_params.different_weights_for_height) {
620       c += "  " + weights_global_ptr +
621            " filters_loc = args.weights.GetPtr() + (DST_S * "
622            "args.src_tensor.Height() + DST_Y * " +
623            std::to_string(block_size.w) + ") * 4 * args.src_tensor.Slices();\n";
624     } else {
625       std::string kernel_spatial_offset = "";
626       if (!conv_params_.x_kernel_is_1) {
627         kernel_spatial_offset += " * args.kernel_size_x";
628       }
629       if (!conv_params_.y_kernel_is_1) {
630         kernel_spatial_offset += " * args.kernel_size_y";
631       }
632       if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
633         kernel_spatial_offset += " * args.kernel_size_z";
634       }
635       c += "  " + weights_global_ptr +
636            " filters_loc = args.weights.GetPtr() + DST_S * 4 * "
637            "args.src_tensor.Slices()" +
638            kernel_spatial_offset + ";\n";
639     }
640   }
641   if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
642     c += "  for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
643     for (int z = 0; z < block_size.z; ++z) {
644       const std::string zck = "zck" + std::to_string(z);
645       c += "  int zck" + std::to_string(z) + " = kz * args.dilation_z + zc" +
646            std::to_string(z) + ";\n";
647       if (!src_def.SupportsZeroClamp(Axis::DEPTH)) {
648         c += "  bool in_z" + std::to_string(z) + " = " + zck + " >= 0 && " +
649              zck + " < args.src_tensor.Depth();\n";
650         if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
651           c += "  " + zck + " = clamp(" + zck +
652                ", 0, args.src_tensor.Depth() - 1);\n";
653         }
654       }
655     }
656   }
657   if (!conv_params_.y_kernel_is_1) {
658     c += "  for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
659     for (int y = 0; y < block_size.y; ++y) {
660       const std::string yck = "yck" + std::to_string(y);
661       c += "  int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) +
662            ";\n";
663       if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) {
664         c += "  bool in_y" + std::to_string(y) + " = " + yck + " >= 0 && " +
665              yck + " < args.src_tensor.Height();\n";
666         if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
667           c += "  " + yck + " = clamp(" + yck +
668                ", 0, args.src_tensor.Height() - 1);\n";
669         }
670       }
671     }
672   }
673   if (!conv_params_.x_kernel_is_1) {
674     c += "  for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
675     for (int x = 0; x < block_size.x; ++x) {
676       const std::string xck = "xck" + std::to_string(x);
677       c += "  int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" +
678            std::to_string(x) + ";\n";
679       if (!src_def.SupportsZeroClamp(Axis::WIDTH)) {
680         c += "  bool in_x" + std::to_string(x) + " = " + xck + " >= 0 && " +
681              xck + " < args.src_tensor.Width();\n";
682         if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
683           c += "  " + xck + " = clamp(" + xck +
684                ", 0, args.src_tensor.Width() - 1);\n";
685         }
686       }
687     }
688   }
689   const bool need_multiple_slice_strides =
690       src_def.ReturnsZeroForNegOneRead() && !trivial_kernel_size;
691   for (int z = 0; z < block_size.z; ++z) {
692     const std::string zind = std::to_string(z);
693     for (int y = 0; y < block_size.y; ++y) {
694       const std::string yind = std::to_string(y);
695       for (int x = 0; x < block_size.x; ++x) {
696         const std::string xind = std::to_string(x);
697         std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
698         std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
699         const std::string id = generate_id(xind, yind, zind);
700         std::string coords = "" + xc + ", " + yc;
701         if (src_def.HasAxis(Axis::DEPTH)) {
702           std::string zc =
703               conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
704           coords += ", " + zc;
705         }
706         if (src_def.IsLinear()) {
707           c += "  args.src_tensor.GetAddress(addr" + id + ", " + coords +
708                ", 0);\n";
709           if (need_multiple_slice_strides) {
710             const std::string check = generate_check(xind, yind, zind);
711             c += "  addr" + id + " = select(-1, addr" + id + ", (" + check +
712                  "));\n";
713             c += "  int ds" + id +
714                  " = select(0, args.src_tensor.SliceStride(), (" + check +
715                  "));\n";
716           }
717         }
718       }
719     }
720   }
721   if (src_def.IsLinear() && !need_multiple_slice_strides) {
722     c += "  int ds = args.src_tensor.SliceStride();\n";
723   }
724 
725   auto declare_src = [&]() {
726     for (int z = 0; z < block_size.z; ++z) {
727       const std::string zind = std::to_string(z);
728       for (int y = 0; y < block_size.y; ++y) {
729         const std::string yind = std::to_string(y);
730         for (int x = 0; x < block_size.x; ++x) {
731           const std::string xind = std::to_string(x);
732           const std::string id = generate_id(xind, yind, zind);
733           c += "    " + weights_data_type + " src" + id + ";\n";
734         }
735       }
736     }
737   };
738   const bool conditional_read = gpu_info.IsMali();
739   auto read_src = [&]() {
740     const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
741     for (int z = 0; z < block_size.z; ++z) {
742       const std::string zind = std::to_string(z);
743       for (int y = 0; y < block_size.y; ++y) {
744         const std::string yind = std::to_string(y);
745         for (int x = 0; x < block_size.x; ++x) {
746           const std::string xind = std::to_string(x);
747           std::string id = generate_id(xind, yind, zind);
748           const std::string check = generate_check(xind, yind, zind);
749           std::string address;
750           if (src_def.IsLinear()) {
751             address = "addr" + id;
752           } else {
753             std::string xc =
754                 conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
755             std::string yc =
756                 conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
757             address = "" + xc + ", " + yc;
758             if (src_def.HasAxis(Axis::DEPTH)) {
759               std::string zc =
760                   conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
761               address += ", " + zc;
762             }
763             address += ", s";
764           }
765           if (src_def.ReturnsZeroForNegOneRead()) {
766             c += "    src" + id + " = args.src_tensor.Read<" + cl_type + ">(" +
767                  address + ");\n";
768             const std::string ds = trivial_kernel_size ? "ds" : "ds" + id;
769             c += "    " + address + " += " + ds + ";\n";
770           } else {
771             if (!check.empty()) {
772               if (conditional_read) {
773                 c += "    src" + id + " = " + check +
774                      " ? args.src_tensor.Read<" + cl_type + ">(" + address +
775                      ") : INIT_FLT4(0.0f);\n";
776               } else {
777                 c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
778                      ">(" + address + ") * INIT_FLT(" + check + ");\n";
779               }
780             } else {
781               c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
782                    ">(" + address + ");\n";
783             }
784             if (src_def.IsLinear()) {
785               c += "    " + address + " += ds;\n";
786             }
787           }
788         }
789       }
790     }
791   };
792   const bool weights_type_as_accum_type =
793       !(op_def.precision == CalculationsPrecision::F32_F16 &&
794         conv_params.weights_data_type == DataType::FLOAT16);
795   auto conv_core = [&](int shared_offset) {
796     const std::string channels[] = {"x", "y", "z", "w"};
797     for (int s = 0; s < block_size.w; ++s) {
798       const std::string sind = std::to_string(s);
799       if (weights_type_as_accum_type) {
800         for (int ch = 0; ch < 4; ++ch) {
801           for (int z = 0; z < block_size.z; ++z) {
802             const std::string zind = std::to_string(z);
803             for (int y = 0; y < block_size.y; ++y) {
804               const std::string yind = std::to_string(y);
805               for (int x = 0; x < block_size.x; ++x) {
806                 const std::string xind = std::to_string(x);
807                 std::string R = "r" + generate_id_full(xind, yind, zind, sind);
808                 std::string S = "src" + generate_id(xind, yind, zind);
809                 if (use_simd_broadcast) {
810                   int simd_id = (s * 4 + ch + shared_offset) / simd_size;
811                   int thread_id = (s * 4 + ch + shared_offset) % simd_size;
812                   std::string w_val_x = "SUB_GROUP_BROADCAST(simd_w" +
813                                         std::to_string(simd_id) + ".x, " +
814                                         std::to_string(thread_id) + "u)";
815                   std::string w_val_y = "SUB_GROUP_BROADCAST(simd_w" +
816                                         std::to_string(simd_id) + ".y, " +
817                                         std::to_string(thread_id) + "u)";
818                   std::string w_val_z = "SUB_GROUP_BROADCAST(simd_w" +
819                                         std::to_string(simd_id) + ".z, " +
820                                         std::to_string(thread_id) + "u)";
821                   std::string w_val_w = "SUB_GROUP_BROADCAST(simd_w" +
822                                         std::to_string(simd_id) + ".w, " +
823                                         std::to_string(thread_id) + "u)";
824                   if (GetWeightsDescription().IsI4O4()) {
825                     c += "    " + R + ".x += " + w_val_x + " * " + S + "." +
826                          channels[ch] + ";\n";
827                     c += "    " + R + ".y += " + w_val_y + " * " + S + "." +
828                          channels[ch] + ";\n";
829                     c += "    " + R + ".z += " + w_val_z + " * " + S + "." +
830                          channels[ch] + ";\n";
831                     c += "    " + R + ".w += " + w_val_w + " * " + S + "." +
832                          channels[ch] + ";\n";
833                   } else {
834                     c += "    " + R + "." + channels[ch] + " += " + w_val_x +
835                          " * " + S + ".x;\n";
836                     c += "    " + R + "." + channels[ch] + " += " + w_val_y +
837                          " * " + S + ".y;\n";
838                     c += "    " + R + "." + channels[ch] + " += " + w_val_z +
839                          " * " + S + ".z;\n";
840                     c += "    " + R + "." + channels[ch] + " += " + w_val_w +
841                          " * " + S + ".w;\n";
842                   }
843                 } else {
844                   const std::string weight_id =
845                       std::to_string(s * 4 + ch + shared_offset);
846                   std::string w_val;
847                   if (conv_params.AreWeightsBuffer()) {
848                     w_val = "weights_cache[" + weight_id + "]";
849                   } else {
850                     w_val = "f" + weight_id;
851                   }
852                   if (GetWeightsDescription().IsI4O4()) {
853                     c += "    " + R + " += " + w_val + " * " + S + "." +
854                          channels[ch] + ";\n";
855                   } else {
856                     c += "    " + R + "." + channels[ch] + " += dot(" + w_val +
857                          ", " + S + ");\n";
858                   }
859                 }
860               }
861             }
862           }
863         }
864       } else {  // F32_F16 precision and weights type is float16
865         for (int z = 0; z < block_size.z; ++z) {
866           const std::string zind = std::to_string(z);
867           for (int y = 0; y < block_size.y; ++y) {
868             const std::string yind = std::to_string(y);
869             for (int x = 0; x < block_size.x; ++x) {
870               const std::string xind = std::to_string(x);
871               std::string R = "r" + generate_id_full(xind, yind, zind, sind);
872               std::string S = "src" + generate_id(xind, yind, zind);
873               std::vector<std::string> F(4);
874               for (int i = 0; i < 4; ++i) {
875                 std::string weight_id =
876                     std::to_string(s * 4 + i + shared_offset);
877                 if (conv_params.AreWeightsBuffer()) {
878                   F[i] = "weights_cache[" + weight_id + "]";
879                 } else {
880                   F[i] = "f" + weight_id;
881                 }
882               }
883               if (GetWeightsDescription().IsI4O4()) {
884                 c += "    " + R + " += TO_ACCUM_TYPE(" + S + ".x * " + F[0] +
885                      " + " + S + ".y * " + F[1] + " + " + S + ".z * " + F[2] +
886                      " + " + S + ".w * " + F[3] + ");\n";
887               } else {
888                 c += "    " + R + ".x += dot(" + S + ", " + F[0] + ");\n";
889                 c += "    " + R + ".y += dot(" + S + ", " + F[1] + ");\n";
890                 c += "    " + R + ".z += dot(" + S + ", " + F[2] + ");\n";
891                 c += "    " + R + ".w += dot(" + S + ", " + F[3] + ");\n";
892               }
893             }
894           }
895         }
896       }
897     }
898   };
899 
900   c += "  int s = 0;\n";
901   c += "  do {\n";
902   declare_src();
903   const int total_work_items =
904       work_group_size_.x * work_group_size_.y * work_group_size_.z;
905   if (conv_params.weights_upload_type ==
906       ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
907     c += GenerateAsyncUpload("weights_cache", "filters_loc",
908                              /*global_offset_name*/ "", local_mem_size);
909   } else if (conv_params.weights_upload_type ==
910              ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
911     c += "    " + barrier + ";\n";
912     c += GenerateUploadByThreads("weights_cache", "filters_loc",
913                                  /*global_offset_name*/ "", "lid",
914                                  total_work_items, local_mem_size);
915   } else if (use_simd_broadcast) {
916     int parts = local_mem_size / simd_size;
917     int reminder = local_mem_size % simd_size;
918     for (int i = 0; i < parts; ++i) {
919       c += "    FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " +
920            std::to_string(i * simd_size) + "];\n";
921     }
922     if (reminder) {
923       c += "    FLT4 simd_w" + std::to_string(parts) + ";\n";
924       c += "    if (simd_id < " + std::to_string(reminder) + ") {\n";
925       c += "      simd_w" + std::to_string(parts) +
926            " = filters_loc[simd_id + " + std::to_string(parts * simd_size) +
927            "];\n";
928       c += "    }\n";
929     }
930   } else if (conv_params.AreWeightsBuffer()) {  // GLOBAL_MEM/CONSTANT_MEM
931     c += "    weights_cache = filters_loc;\n";
932   } else {  // TEXTURES_MEM
933     for (int dst_s = 0; dst_s < block_size.w; ++dst_s) {
934       std::string f_y = trivial_kernel_size ? "s" : "filter_offset";
935       if (conv_params.different_weights_for_height) {
936         f_y = "DST_Y * args.src_tensor.Slices() + s";
937       }
938       c += absl::Substitute(
939           R"(    FLT4 f$2 = args.weights0.Read(DST_S + $0, $1);
940     FLT4 f$3 = args.weights1.Read(DST_S + $0, $1);
941     FLT4 f$4 = args.weights2.Read(DST_S + $0, $1);
942     FLT4 f$5 = args.weights3.Read(DST_S + $0, $1);
943 )",
944           dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2,
945           dst_s * 4 + 3);
946     }
947     if (!trivial_kernel_size) {
948       c += "    filter_offset++;\n";
949     }
950   }
951   read_src();
952   c += "    s += 1;\n";
953   if (conv_params.weights_upload_type ==
954       ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
955     c += "    " + barrier + ";\n";
956   }
957   conv_core(0);
958   for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
959     read_src();
960     conv_core(i * block_size.w * 4);
961     c += "    s += 1;\n";
962   }
963   if (conv_params.AreWeightsBuffer()) {
964     c += "    filters_loc += " + std::to_string(local_mem_size) + ";\n";
965   }
966   c += "  } while (s < args.src_tensor.Slices());\n";
967   if (!conv_params.x_kernel_is_1) {
968     c += "  };\n";
969   }
970   if (!conv_params.y_kernel_is_1) {
971     c += "  };\n";
972   }
973   if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
974     c += "  };\n";
975   }
976   if (conv_params.AreWeightsBuffer()) {
977     if (conv_params.weights_upload_type ==
978         ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
979       c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S",
980                                block_size.w);
981     } else if (conv_params.weights_upload_type ==
982                ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
983       c += "  " + barrier + ";\n";
984       c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()",
985                                    "DST_S", "lid", total_work_items,
986                                    block_size.w);
987       c += "  " + barrier + ";\n";
988     } else {
989       c += "  weights_cache = args.biases.GetPtr() + DST_S;\n";
990     }
991   }
992   if (late_oob_check) {
993     c += "  if (" + dst_oob_check + ") {\n";
994     c += "    return;\n";
995     c += "  }\n";
996   }
997 
998   auto generate_dst_check = [&](int x, int y, int z) {
999     std::string check;
1000     const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
1001     const std::vector<std::string> names{"Width()", "Height()", "Depth()"};
1002     std::vector<std::string> coords(3);
1003     coords[0] = "DST_X + " + std::to_string(x);
1004     coords[1] = "DST_Y + " + std::to_string(y);
1005     coords[2] = "DST_Z + " + std::to_string(z);
1006     const std::vector<int> ids{x, y, z};
1007     for (int i = 0; i < axes.size(); ++i) {
1008       const auto& axis = axes[i];
1009       if (src_def.HasAxis(axis) && ids[i] != 0) {
1010         if (!check.empty()) {
1011           check += " && ";
1012         }
1013         check += coords[i] + " < args.dst_tensor." + names[i];
1014       }
1015     }
1016     return check;
1017   };
1018 
1019   for (int s = 0; s < block_size.w; ++s) {
1020     const std::string sind = std::to_string(s);
1021     c += "  if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n";
1022     c += "  {\n";
1023     if (conv_params.AreWeightsBuffer()) {
1024       c += "    FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n";
1025     } else {
1026       c += "    FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n";
1027     }
1028     for (int z = 0; z < block_size.z; ++z) {
1029       const std::string zind = std::to_string(z);
1030       for (int y = 0; y < block_size.y; ++y) {
1031         const std::string yind = std::to_string(y);
1032         for (int x = 0; x < block_size.x; ++x) {
1033           const std::string xind = std::to_string(x);
1034           const std::string id = generate_id_full(xind, yind, zind, sind);
1035           const std::string check = generate_dst_check(x, y, z);
1036           std::string coords = "DST_X + " + xind + ", DST_Y + " + yind;
1037           if (src_def.HasAxis(Axis::DEPTH)) {
1038             coords += ", DST_Z + " + zind;
1039           }
1040           coords += ", DST_S + " + sind;
1041           if (!check.empty()) {
1042             c += "  if (" + check + ") {\n";
1043           } else {
1044             c += "  {\n";
1045           }
1046           c += "    FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
1047           c += "    args.dst_tensor.Write(res, " + coords + ");\n";
1048           c += "  }\n";
1049         }
1050       }
1051     }
1052     c += "  }\n";
1053   }
1054   c += "}\n";
1055   return c;
1056 }
1057 
// Chooses convolution tuning parameters from per-vendor heuristics:
//  - conv_params.block_size (int4: x, y, z, w=output-slice blocking),
//  - the weights upload strategy and weights data type/layout,
//  - src_depth_loop_size (input-slice loop unroll factor),
// and, as a side effect, sets the work_group_size_ and
// work_group_launch_order_ members of this object.
//
// src_depth / dst_depth are slice counts (callers pass channels divided by 4,
// rounded up). x_kernel_is_1 / y_kernel_is_1 indicate the kernel degenerates
// to 1x1 along that axis. dst_shape is optional; when present it is used to
// refine the block size from the estimated workload.
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition, int src_depth,
    int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
    bool different_weights_for_height, const BHWC* dst_shape) {
  ConvParams conv_params;
  conv_params.linear_spatial = false;
  conv_params.linear_all = false;
  conv_params.block_size = int4(1, 1, 1, 1);
  // Default weights type follows the op precision; some vendor branches
  // below override it.
  conv_params.weights_data_type =
      DeduceDataTypeFromPrecision(definition.precision);
  conv_params.x_kernel_is_1 = x_kernel_is_1;
  conv_params.y_kernel_is_1 = y_kernel_is_1;
  conv_params.different_weights_for_height = different_weights_for_height;
  if (gpu_info.IsNvidia()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      // Flatten the spatial dimensions into one linear work dimension.
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(2, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    // Output-slice blocking: prefer 4, then 2, when dst_depth divides evenly
    // or is large enough to keep the tail small.
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (dst_shape) {
      // Occupancy-driven trimming: estimate warps per compute unit and
      // shrink the block when there is not enough parallelism to fill the
      // GPU.
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      float task_size_per_cu =
          static_cast<float>(task_size) / gpu_info.GetComputeUnitsCount();
      int block_size = conv_params.block_size.x * conv_params.block_size.y *
                       conv_params.block_size.w;
      float threads_per_cu = task_size_per_cu / block_size;
      float warps_per_cu = threads_per_cu / 32 /*warp_size*/;
      if (warps_per_cu < 8.0f) {
        conv_params.block_size.x = 1;
      }
      if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4) {
        conv_params.block_size.w /= 2;
      }
      if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2) {
        conv_params.block_size.w /= 2;
      }
    }
    // Unroll the input-slice loop when src_depth divides evenly; the x4
    // unroll is limited to small output blocks to bound register pressure.
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsPowerVR()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    // PowerVR: keep FP32 weights unless the whole op runs in F16.
    conv_params.weights_data_type =
        definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16
                                                           : DataType::FLOAT32;
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    // Weights are staged into local memory with async work-group copies.
    conv_params.weights_upload_type =
        WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (definition.precision == CalculationsPrecision::F16) {
      // F16: cap slice blocking at 4 and trade it for deeper src-depth
      // unrolling plus a wider x block.
      conv_params.block_size.w = std::min(4, conv_params.block_size.w);
      if (src_depth % 2 == 0) {
        conv_params.src_depth_loop_size = 2;
      }
      if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
        conv_params.src_depth_loop_size = 4;
      }
      if (conv_params.block_size.w == 1) {
        if (src_depth % 2 == 0) {
          conv_params.src_depth_loop_size = 2;
        }
        if (src_depth % 4 == 0) {
          conv_params.src_depth_loop_size = 4;
        }
        if (src_depth <= 8) {
          // Small input: unroll the entire slice loop.
          conv_params.src_depth_loop_size = src_depth;
        }
      }
      conv_params.block_size.x = 2;
    }
  } else if (gpu_info.IsAMD()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      work_group_size_ = int3(8, 4, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    }

    conv_params.block_size = int4(2, 1, 1, 1);
    if (x_kernel_is_1 && y_kernel_is_1) {
      // 1x1 convolution: no halo reads, a taller block is cheap.
      conv_params.block_size.y = 2;
    }
    conv_params.src_depth_loop_size = 1;
    // AMD: weights are read through constant memory.
    conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = 1;
    }
    if (src_depth % 2 == 0 && src_depth >= 16) {
      conv_params.src_depth_loop_size = 2;
    }
  } else if (gpu_info.IsMali()) {
    // Mali: pick a total block budget from the estimated task size, then
    // map that budget onto a concrete int4 block shape.
    int block_size = 2;
    if (dst_shape) {
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      block_size = GetRecommendedBlockSizeForConv(
          gpu_info, definition.precision, task_size);
    }
    if (!x_kernel_is_1 || !y_kernel_is_1) {
      // Non-1x1 kernels re-read halo pixels; keep the block modest.
      block_size = std::min(block_size, 4);
    }
    if (block_size == 8) {
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 2, 1, 2);
      }
    } else if (block_size == 4) {
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 1, 1, 2);
      }
    } else if (block_size == 2) {
      conv_params.block_size = int4(2, 1, 1, 1);
    } else {
      conv_params.block_size = int4(1, 1, 1, 1);
    }
    conv_params.src_depth_loop_size = 1;
    MaliInfo mali_info = gpu_info.mali_info;
    // Midgard-generation Mali is excluded from src-depth unrolling.
    if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && block_size == 1 && !mali_info.IsMidgard() &&
        definition.precision == CalculationsPrecision::F16) {
      conv_params.src_depth_loop_size = 4;
    }
    work_group_size_ = int3(4, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else if (gpu_info.IsAdreno()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      // Older Adreno 3xx: block shape depends on precision.
      if (definition.precision == CalculationsPrecision::F16) {
        conv_params.block_size = int4(2, 2, 1, 2);
      } else if (definition.precision == CalculationsPrecision::F32_F16) {
        conv_params.block_size = int4(2, 1, 1, 2);
      } else {  // F32
        conv_params.block_size = int4(2, 2, 1, 1);
      }
    }
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    if (definition.src_tensors.size() == 2) {
      // dynamic weights supported only with buffers.
      conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4;
    }
  } else if (gpu_info.IsIntel()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    int sub_group_size = 16;
    const bool supports_subgroups =
        gpu_info.SupportsExtension("cl_khr_subgroups") ||
        gpu_info.SupportsExtension("cl_intel_subgroups");
    // Prefer SIMD-broadcast weight sharing when the required subgroup size
    // is available; it is skipped for F32_F16 mixed precision.
    if (definition.precision != CalculationsPrecision::F32_F16 &&
        supports_subgroups &&
        gpu_info.SupportsExtension("cl_intel_required_subgroup_size") &&
        gpu_info.SupportsSubGroupWithSize(sub_group_size)) {
      conv_params.weights_upload_type =
          WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
      conv_params.simd_size = sub_group_size;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsApple()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    work_group_size_ = int3(8, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = true;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else {
    // Unknown vendor: conservative defaults with global-memory weights.
    conv_params.block_size = int4(1, 1, 1, 4);
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  }
  // Weights layout: buffer-backed uploads use OHWI group layouts, texture
  // uploads use the 2D X4 layouts; Apple gets the O4I4/I4-flavored variant
  // in each case.
  if (conv_params.AreWeightsBuffer()) {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout = WeightsLayout::kOHWIOGroupO4I4;
    } else {
      conv_params.weights_layout = WeightsLayout::kOHWIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout = WeightsLayout::k2DX4O4YIsHWIAndXIsOOGroupI4;
    } else {
      conv_params.weights_layout = WeightsLayout::k2DX4I4YIsHWIAndXIsOOGroupO4;
    }
  }

  return conv_params;
}
1338 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1339 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1340     const GpuInfo& gpu_info, const OperationDef& definition,
1341     const Convolution2DAttributes& attr, const BHWC* dst_shape) {
1342   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1343   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1344   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
1345                              attr.dilations.w == 1 &&
1346                              attr.padding.prepended.w == 0 &&
1347                              attr.padding.appended.w == 0;
1348   const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
1349                              attr.dilations.h == 1 &&
1350                              attr.padding.prepended.h == 0 &&
1351                              attr.padding.appended.h == 0;
1352   return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1353                          x_kernel_is_1, y_kernel_is_1, false, dst_shape);
1354 }
1355 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution3DAttributes & attr,const BHWDC * dst_shape)1356 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1357     const GpuInfo& gpu_info, const OperationDef& definition,
1358     const Convolution3DAttributes& attr, const BHWDC* dst_shape) {
1359   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1360   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1361   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
1362                              attr.dilations.w == 1 &&
1363                              attr.padding.prepended.w == 0 &&
1364                              attr.padding.appended.w == 0;
1365   const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
1366                              attr.dilations.h == 1 &&
1367                              attr.padding.prepended.h == 0 &&
1368                              attr.padding.appended.h == 0;
1369   const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 &&
1370                              attr.dilations.d == 1 &&
1371                              attr.padding.prepended.d == 0 &&
1372                              attr.padding.appended.d == 0;
1373 
1374   ConvPowerVR::ConvParams result;
1375   BHWC shape;
1376   if (dst_shape) {
1377     shape.b = dst_shape->b;
1378     shape.h = dst_shape->h * dst_shape->d;
1379     shape.w = dst_shape->w;
1380     shape.c = dst_shape->c;
1381     result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1382                              x_kernel_is_1, y_kernel_is_1, false, &shape);
1383   } else {
1384     result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1385                              x_kernel_is_1, y_kernel_is_1, false, nullptr);
1386   }
1387   result.z_kernel_is_1 = z_kernel_is_1;
1388   return result;
1389 }
1390 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC & weights_shape,const BHWC * dst_shape)1391 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1392     const GpuInfo& gpu_info, const OperationDef& definition,
1393     const Convolution2DAttributes& attr, const BHWC& weights_shape,
1394     const BHWC* dst_shape) {
1395   const int dst_depth = DivideRoundUp(weights_shape.b, 4);
1396   const int src_depth = DivideRoundUp(weights_shape.c, 4);
1397   const bool x_kernel_is_1 =
1398       weights_shape.w == 1 && attr.strides.w == 1 && attr.dilations.w == 1 &&
1399       attr.padding.prepended.w == 0 && attr.padding.appended.w == 0;
1400   const bool y_kernel_is_1 =
1401       weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
1402       attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
1403   return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1404                          x_kernel_is_1, y_kernel_is_1, false, dst_shape);
1405 }
1406 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const FullyConnectedAttributes & attr,const BHWC * dst_shape)1407 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1408     const GpuInfo& gpu_info, const OperationDef& definition,
1409     const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
1410   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1411   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1412   ConvPowerVR::ConvParams params = GuessBestParams(
1413       gpu_info, definition, src_depth, dst_depth, true, true, false, dst_shape);
1414   work_group_size_.x *= work_group_size_.y;
1415   work_group_size_.y = 1;
1416   params.block_size.x *= params.block_size.y;
1417   params.block_size.y = 1;
1418   return params;
1419 }
1420 
GuessBestParamsWinograd(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1421 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
1422     const GpuInfo& gpu_info, const OperationDef& definition,
1423     const Convolution2DAttributes& attr, const BHWC* dst_shape) {
1424   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1425   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1426   ConvPowerVR::ConvParams params = GuessBestParams(
1427       gpu_info, definition, src_depth, dst_depth, true, true, true, dst_shape);
1428   params.block_size.x *= params.block_size.y;
1429   params.block_size.y = 1;
1430   return params;
1431 }
1432 
CreateConvPowerVR(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1433 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
1434                               const OperationDef& definition,
1435                               const Convolution2DAttributes& attr,
1436                               const BHWC* dst_shape) {
1437   ConvPowerVR result(definition, attr, gpu_info, dst_shape);
1438   result.GenerateCode(gpu_info);
1439   result.UploadData(attr.weights, attr.bias);
1440   return result;
1441 }
1442 
CreateConvPowerVR(const GpuInfo & gpu_info,const OperationDef & definition,const FullyConnectedAttributes & attr,const BHWC * dst_shape)1443 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
1444                               const OperationDef& definition,
1445                               const FullyConnectedAttributes& attr,
1446                               const BHWC* dst_shape) {
1447   ConvPowerVR result(definition, attr, gpu_info, dst_shape);
1448   result.GenerateCode(gpu_info);
1449   result.UploadData(attr.weights, attr.bias);
1450   return result;
1451 }
1452 
CreateConvPowerVRDynamicWeights(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC & weights_shape,const BHWC * dst_shape)1453 ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
1454                                             const OperationDef& definition,
1455                                             const Convolution2DAttributes& attr,
1456                                             const BHWC& weights_shape,
1457                                             const BHWC* dst_shape) {
1458   ConvPowerVR result(definition, attr, weights_shape, gpu_info, dst_shape);
1459   result.GenerateCode(gpu_info);
1460   result.UploadBias(attr.bias);
1461   return result;
1462 }
1463 
CreateConvPowerVRWino4x4To6x6(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1464 ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
1465                                           const OperationDef& definition,
1466                                           const Convolution2DAttributes& attr,
1467                                           const BHWC* dst_shape) {
1468   ConvPowerVR result(definition);
1469   result.conv_params_ =
1470       result.GuessBestParamsWinograd(gpu_info, definition, attr, dst_shape);
1471   result.GenerateCode(gpu_info);
1472   result.UploadDataForWinograd4x4To6x6(attr.weights);
1473   return result;
1474 }
1475 
CreateConvPowerVR3D(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution3DAttributes & attr,const BHWDC * dst_shape)1476 ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
1477                                 const OperationDef& definition,
1478                                 const Convolution3DAttributes& attr,
1479                                 const BHWDC* dst_shape) {
1480   ConvPowerVR result(definition, attr, gpu_info, dst_shape);
1481   result.GenerateCode(gpu_info);
1482   result.UploadWeights(attr.weights);
1483   result.UploadBias(attr.bias);
1484   return result;
1485 }
1486 
1487 }  // namespace gpu
1488 }  // namespace tflite
1489