• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.h"
17 
18 #include <algorithm>
19 #include <string>
20 #include <utility>
21 
22 #include "absl/strings/substitute.h"
23 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/util.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
28 
29 namespace tflite {
30 namespace gpu {
31 
32 namespace {
// Emits source code that copies `elements_to_upload` values from a global
// buffer into local memory, with all `total_work_items` threads of the work
// group participating: each thread copies one element per full stripe, and
// the first (elements % work_items) threads copy the tail stripe.
std::string GenerateUploadByThreads(const std::string& local_ptr_name,
                                    const std::string& global_ptr_name,
                                    const std::string& global_offset_name,
                                    const std::string& lid_name,
                                    int total_work_items,
                                    int elements_to_upload) {
  const std::string src_base =
      global_offset_name.empty() ? "" : global_offset_name + " + ";
  // Builds one generated assignment moving a single element at stripe
  // offset `shift` (indent is added by the caller side below).
  auto copy_stmt = [&](int shift) {
    const std::string idx = lid_name + " + " + std::to_string(shift);
    return local_ptr_name + "[" + idx + "] = " + global_ptr_name + "[" +
           src_base + idx + "];\n";
  };
  std::string code;
  const int full_stripes = elements_to_upload / total_work_items;
  for (int stripe = 0; stripe < full_stripes; ++stripe) {
    code += "    " + copy_stmt(total_work_items * stripe);
  }
  const int tail = elements_to_upload % total_work_items;
  if (tail != 0) {
    // Guard so only the first `tail` threads copy the partial stripe.
    code += "    if (" + lid_name + " < " + std::to_string(tail) + ") {\n";
    code += "      " + copy_stmt(total_work_items * full_stripes);
    code += "    }\n";
  }
  return code;
}
60 
// Emits a single async_work_group_copy() call uploading
// `elements_to_upload` values from a (optionally offset) global pointer
// into local memory.
std::string GenerateAsyncUpload(const std::string& local_ptr_name,
                                const std::string& global_ptr_name,
                                const std::string& global_offset_name,
                                int elements_to_upload) {
  std::string src_expr = global_ptr_name;
  if (!global_offset_name.empty()) {
    src_expr += " + " + global_offset_name;
  }
  return "    async_work_group_copy(" + local_ptr_name + ", " + src_expr +
         ", " + std::to_string(elements_to_upload) + ", 0);\n";
}
72 
// Emits code computing the destination coordinates of the output block
// handled by the current invocation: DST_X, DST_Y, DST_Z (only when
// need_depth) and the slice coordinate DST_S, each scaled by the
// per-invocation block size.
//
// work_group_launch_order permutes the dispatch axes; launch_remap inverts
// that permutation so that when logical axis i was not launched in slot i,
// its global id is rebuilt as GROUP_ID_<remap> * GROUP_SIZE_i + LOCAL_ID_i.
// linear_spatial packs all spatial dims into dispatch axis 0; linear_all
// additionally packs the slice dimension into axis 0.
std::string GenerateBlockCoords(const int4& block_size,
                                const int3& work_group_launch_order,
                                bool linear_spatial, bool linear_all,
                                bool need_depth) {
  std::string c;
  // Inverse permutation of the launch order.
  int3 launch_remap;
  launch_remap[work_group_launch_order.x] = 0;
  launch_remap[work_group_launch_order.y] = 1;
  launch_remap[work_group_launch_order.z] = 2;
  if (linear_all) {
    // Single 1D dispatch: slice index is the high part, spatial the low part.
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int DST_S = (linear_id / args.task_size_spatial) * " +
         std::to_string(block_size.w) + ";\n";
    c += "  int linear_spatial = linear_id % args.task_size_spatial;\n";
    if (need_depth) {
      // Unpack x, then y, then z from the linearized spatial index.
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += "  linear_spatial = linear_spatial / args.task_size_x;\n";
      c += "  int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += "  int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
  } else if (linear_spatial) {
    // Axis 0 carries the linearized spatial index; axis 1 carries slices.
    if (work_group_launch_order[0] == 0) {
      c += "  int linear_spatial = GLOBAL_ID_0;\n";
    } else {
      // Axis 0 was launched elsewhere: rebuild its global id.
      c += "  int linear_spatial = GROUP_ID_" +
           std::to_string(launch_remap[0]) + " * GROUP_SIZE_0 + LOCAL_ID_0;\n";
    }
    if (need_depth) {
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += "  linear_spatial = linear_spatial / args.task_size_x;\n";
      c += "  int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += "  int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
    if (work_group_launch_order[1] == 1) {
      c +=
          "  int DST_S = GLOBAL_ID_1 * " + std::to_string(block_size.w) + ";\n";
    } else {
      c += "  int DST_S = (GROUP_ID_" + std::to_string(launch_remap[1]) +
           " * GROUP_SIZE_1 + LOCAL_ID_1) * " + std::to_string(block_size.w) +
           ";\n";
    }
  } else {
    // Full 3D dispatch: axis 0 -> x, axis 1 -> y (and z when need_depth),
    // axis 2 -> slices.
    if (work_group_launch_order[0] == 0) {
      c +=
          "  int DST_X = GLOBAL_ID_0 * " + std::to_string(block_size.x) + ";\n";
    } else {
      c += "  int DST_X = (GROUP_ID_" + std::to_string(launch_remap[0]) +
           " * GROUP_SIZE_0 + LOCAL_ID_0) * " + std::to_string(block_size.x) +
           ";\n";
    }
    std::string global_id_1;
    if (work_group_launch_order[1] == 1) {
      global_id_1 = "GLOBAL_ID_1";
    } else {
      global_id_1 = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
                    " * GROUP_SIZE_1 + LOCAL_ID_1)";
    }
    if (need_depth) {
      // y and z share dispatch axis 1: z is the high part.
      c += "  int linear_id_1 = " + global_id_1 + ";\n";
      c += "  int DST_Z = (linear_id_1 / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
      c += "  int DST_Y = (linear_id_1 % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
    } else {
      c += "  int DST_Y = " + global_id_1 + " * " +
           std::to_string(block_size.y) + ";\n";
    }
    if (work_group_launch_order[2] == 2) {
      c +=
          "  int DST_S = GLOBAL_ID_2 * " + std::to_string(block_size.w) + ";\n";
    } else {
      c += "  int DST_S = (GROUP_ID_" + std::to_string(launch_remap[2]) +
           " * GROUP_SIZE_2 + LOCAL_ID_2) * " + std::to_string(block_size.w) +
           ";\n";
    }
  }

  return c;
}
168 }  // namespace
169 
// 2D convolution. Strides/kernel/dilation come from `attr`; padding is
// stored negated (it is the offset added to source coordinates in the
// generated code). `dst_shape` may be null; when present it is used by
// GuessBestParams to tune conv_params_.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
179 
// 2D convolution with dynamic weights: the kernel size comes from
// `weights_shape` (weights arrive as a runtime tensor) rather than from
// attr.weights. `dst_shape` may be null; used only for parameter tuning.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const BHWC& weights_shape, const GpuInfo& gpu_info,
                         const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, weights_shape,
                                   dst_shape)) {}
191 
// Fully connected layer expressed as a 1x1 convolution with unit
// stride/dilation and no padding.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const FullyConnectedAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
201 
// Bare constructor with identity geometry (1x1 kernel, unit stride/dilation,
// zero padding); conv_params_ is left default-constructed — presumably
// filled in later by a specialized factory (not visible here).
ConvPowerVR::ConvPowerVR(const OperationDef& definition)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1) {}
208 
// Move constructor: moves the GPUOperation base and copies the small
// trivially-copyable parameter members.
ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
    : GPUOperation(std::move(operation)),
      stride_(operation.stride_),
      padding_(operation.padding_),
      kernel_size_(operation.kernel_size_),
      dilation_(operation.dilation_),
      conv_params_(operation.conv_params_) {}
216 
// 3D convolution: same as the 2D case but with the depth components of
// stride/padding/kernel/dilation populated from `attr`.
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution3DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWDC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
               -attr.padding.prepended.d, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
                   attr.weights.shape.d, 1),
      dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
228 
// Move assignment via member-wise swap, then moving the base class state.
ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
  if (this != &operation) {
    std::swap(stride_, operation.stride_);
    std::swap(padding_, operation.padding_);
    std::swap(kernel_size_, operation.kernel_size_);
    std::swap(dilation_, operation.dilation_);
    std::swap(conv_params_, operation.conv_params_);
    GPUOperation::operator=(std::move(operation));
  }
  return *this;
}
240 
GenerateCode(const GpuInfo & gpu_info)241 void ConvPowerVR::GenerateCode(const GpuInfo& gpu_info) {
242   if (conv_params_.linear_all) {
243     grid_dimension_ = 1;
244   } else if (conv_params_.linear_spatial) {
245     grid_dimension_ = 2;
246   }
247   const bool stride_correction =
248       definition_.IsBatchSupported() && stride_.x != 1;
249   code_ = GenerateConv(gpu_info, definition_, stride_correction, conv_params_);
250   if (definition_.precision == CalculationsPrecision::F16 &&
251       gpu_info.IsPowerVR()) {
252     compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
253   }
254   if (gpu_info.IsMali()) {
255     compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
256   }
257   if (conv_params_.IsPrivateMemBroadcast() && gpu_info.IsCL20OrHigher()) {
258     compiler_options_.push_back(CompilerOptions::kCl20);
259   }
260   bool kernel_is_trivial =
261       conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
262   if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
263     kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
264   }
265   if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx() &&
266       definition_.precision == CalculationsPrecision::F16 &&
267       kernel_is_trivial) {
268     compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
269   }
270 }
271 
// Sets the runtime kernel arguments declared in GenerateConv. Per-axis
// convolution arguments exist only when that kernel dimension is not 1, so
// they are set under the same conditions used when declaring them.
absl::Status ConvPowerVR::BindArguments(ArgumentsBinder* args) {
  if (!conv_params_.x_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_x", stride_.x));
    // x padding/dilation are scaled by batch because width is stored
    // batched (W * B) when batch is supported.
    RETURN_IF_ERROR(args->SetInt("padding_x", padding_.x * src_[0]->Batch()));
    RETURN_IF_ERROR(args->SetInt("kernel_size_x", kernel_size_.x));
    RETURN_IF_ERROR(args->SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
  }
  if (!conv_params_.y_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_y", stride_.y));
    RETURN_IF_ERROR(args->SetInt("padding_y", padding_.y));
    RETURN_IF_ERROR(args->SetInt("kernel_size_y", kernel_size_.y));
    RETURN_IF_ERROR(args->SetInt("dilation_y", dilation_.y));
  }
  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH) &&
      !conv_params_.z_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_z", stride_.z));
    RETURN_IF_ERROR(args->SetInt("padding_z", padding_.z));
    RETURN_IF_ERROR(args->SetInt("kernel_size_z", kernel_size_.z));
    RETURN_IF_ERROR(args->SetInt("dilation_z", dilation_.z));
  }
  // Task sizes: destination extents divided by the per-invocation block
  // size (must match the computation in GetGridSize).
  const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                        conv_params_.block_size.x);
  const int task_size_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int task_size_z =
      DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
  RETURN_IF_ERROR(args->SetInt("task_size_x", task_size_x));
  RETURN_IF_ERROR(args->SetInt("task_size_y", task_size_y));
  const int task_size_spatial = task_size_x * task_size_y * task_size_z;
  RETURN_IF_ERROR(args->SetInt("task_size_spatial", task_size_spatial));
  return absl::OkStatus();
}
304 
GetGridSize() const305 int3 ConvPowerVR::GetGridSize() const {
306   const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
307                                         conv_params_.block_size.x);
308   const int task_size_y =
309       DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
310   const int task_size_z =
311       DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
312   const int task_size_s =
313       DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w);
314   int3 wg;
315 
316   if (conv_params_.linear_all) {
317     return int3(task_size_x * task_size_y * task_size_z * task_size_s, 1, 1);
318   } else if (conv_params_.linear_spatial) {
319     return int3(task_size_x * task_size_y * task_size_z, task_size_s, 1);
320   } else {
321     return int3(task_size_x, task_size_y * task_size_z, task_size_s);
322   }
323 }
324 
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups) const325 void ConvPowerVR::GetPossibleKernelWorkGroups(
326     TuningType tuning_type, const GpuInfo& gpu_info,
327     const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
328   if (conv_params_.weights_upload_type ==
329           WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
330       conv_params_.weights_upload_type ==
331           WeightsUploadType::LOCAL_MEM_BY_THREADS ||
332       conv_params_.fixed_work_group_size) {
333     work_groups->push_back(work_group_size_);
334     return;
335   }
336   GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
337                             work_groups);
338 }
339 
GenerateConv(const GpuInfo & gpu_info,const OperationDef & op_def,bool stride_correction,const ConvParams & conv_params)340 std::string ConvPowerVR::GenerateConv(const GpuInfo& gpu_info,
341                                       const OperationDef& op_def,
342                                       bool stride_correction,
343                                       const ConvParams& conv_params) {
344   auto src_desc = op_def.src_tensors[0];
345   src_desc.SetAddressMode(AddressMode::kZero);
346   if (op_def.IsBatchSupported()) {
347     src_desc.SetStateVar("BatchedWidth", "true");
348   }
349   AddSrcTensor("src_tensor", src_desc);
350   if (op_def.src_tensors.size() == 2) {
351     // dynamic weights
352     BufferDescriptor desc;
353     desc.element_type = op_def.src_tensors[1].data_type;
354     desc.element_size = 4;
355     desc.memory_type = conv_params.weights_upload_type ==
356                                ConvPowerVR::WeightsUploadType::CONSTANT_MEM
357                            ? MemoryType::CONSTANT
358                            : MemoryType::GLOBAL;
359 
360     AddSrcBuffer("weights", desc);
361   }
362 
363   const auto& src_def = op_def.src_tensors[0];
364 
365   auto generate_id = [&](const std::string& x, const std::string& y,
366                          const std::string& z) {
367     std::string id;
368     if (src_def.HasAxis(Axis::WIDTH)) {
369       id += "_w" + x;
370     }
371     if (src_def.HasAxis(Axis::HEIGHT)) {
372       id += "_h" + y;
373     }
374     if (src_def.HasAxis(Axis::DEPTH)) {
375       id += "_d" + z;
376     }
377     return id;
378   };
379 
380   auto generate_id_full = [&](const std::string& x, const std::string& y,
381                               const std::string& z, const std::string& s) {
382     return generate_id(x, y, z) + "_s" + s;
383   };
384 
385   auto generate_check = [&](const std::string& x, const std::string& y,
386                             const std::string& z) {
387     std::string check;
388     const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
389     const std::vector<std::string> names{"in_x", "in_y", "in_z"};
390     const std::vector<bool> is_1{conv_params_.x_kernel_is_1,
391                                  conv_params_.y_kernel_is_1,
392                                  conv_params_.z_kernel_is_1};
393     const std::vector<std::string> coords{x, y, z};
394     for (int i = 0; i < axes.size(); ++i) {
395       const auto& axis = axes[i];
396       if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) &&
397           !is_1[i]) {
398         if (!check.empty()) {
399           check += " && ";
400         }
401         check += names[i] + coords[i];
402       }
403     }
404     return check;
405   };
406 
407   auto dst_desc = op_def.dst_tensors[0];
408   if (op_def.IsBatchSupported()) {
409     dst_desc.SetStateVar("BatchedWidth", "true");
410   }
411   AddDstTensor("dst_tensor", dst_desc);
412 
413   if (!conv_params_.x_kernel_is_1) {
414     args_.AddInt("stride_x");
415     args_.AddInt("padding_x");
416     args_.AddInt("kernel_size_x");
417     args_.AddInt("dilation_x");
418   }
419   if (!conv_params_.y_kernel_is_1) {
420     args_.AddInt("stride_y");
421     args_.AddInt("padding_y");
422     args_.AddInt("kernel_size_y");
423     args_.AddInt("dilation_y");
424   }
425   if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
426     args_.AddInt("stride_z");
427     args_.AddInt("padding_z");
428     args_.AddInt("kernel_size_z");
429     args_.AddInt("dilation_z");
430   }
431   args_.AddInt("task_size_x");
432   args_.AddInt("task_size_y");
433   args_.AddInt("task_size_spatial");
434 
435   const int wg_total_size =
436       work_group_size_.x * work_group_size_.y * work_group_size_.z;
437   const std::string barrier =
438       wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
439           ? "SIMD_LOCAL_MEM_BARRIER"
440           : "LOCAL_MEM_BARRIER";
441 
442   const bool need_local_mem =
443       conv_params.weights_upload_type ==
444           ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
445       conv_params.weights_upload_type ==
446           ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
447 
448   const int local_mem_size =
449       conv_params.block_size.w * 4 * conv_params.src_depth_loop_size;
450 
451   const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast();
452   const int simd_size = conv_params.simd_size;
453 
454   const bool late_oob_check = need_local_mem || use_simd_broadcast;
455 
456   const std::string weights_space =
457       conv_params.weights_upload_type ==
458               ConvPowerVR::WeightsUploadType::CONSTANT_MEM
459           ? "__constant"
460           : "__global";
461 
462   const std::string weights_data_type =
463       conv_params.weights_data_type == DataType::FLOAT32 ? "float4" : "half4";
464 
465   const std::string weights_global_ptr =
466       weights_space + " " + weights_data_type + "*";
467 
468   std::string c;
469   if (use_simd_broadcast && gpu_info.IsApiOpenCl()) {
470     if (gpu_info.opencl_info.cl_version == OpenClVersion::kCl2_0) {
471       c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
472     } else if (gpu_info.SupportsExtension("cl_intel_subgroups")) {
473       c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n";
474     }
475   }
476   const int4 block_size = conv_params.block_size;
477   if (conv_params.fixed_work_group_size && gpu_info.IsApiOpenCl()) {
478     c += "__attribute__((reqd_work_group_size(" +
479          std::to_string(work_group_size_.x) + ", " +
480          std::to_string(work_group_size_.y) + ", " +
481          std::to_string(work_group_size_.z) + ")))\n";
482   }
483   if (use_simd_broadcast && gpu_info.IsIntel() && gpu_info.IsApiOpenCl()) {
484     c += "__attribute__((intel_reqd_sub_group_size(" +
485          std::to_string(simd_size) + ")))\n";
486   }
487   std::string dst_oob_check;
488   if (src_def.HasAxis(Axis::DEPTH)) {
489     if (conv_params.linear_all) {
490       dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
491     } else if (conv_params.linear_spatial) {
492       dst_oob_check =
493           "DST_Z >= args.dst_tensor.Depth() || DST_S >= "
494           "args.dst_tensor.Slices()";
495     } else {
496       dst_oob_check =
497           "DST_X >= args.dst_tensor.Width() || DST_Z >= "
498           "args.dst_tensor.Depth() || DST_S >= args.dst_tensor.Slices()";
499     }
500   } else {
501     if (conv_params.linear_all) {
502       dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
503     } else if (conv_params.linear_spatial) {
504       dst_oob_check =
505           "DST_Y >= args.dst_tensor.Height() || DST_S >= "
506           "args.dst_tensor.Slices()";
507     } else {
508       dst_oob_check =
509           "DST_X >= args.dst_tensor.Width() || DST_Y >= "
510           "args.dst_tensor.Height() || DST_S >= args.dst_tensor.Slices()";
511     }
512   }
513   c += "MAIN_FUNCTION($0) {\n";
514   c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
515                            conv_params.linear_spatial, conv_params.linear_all,
516                            src_def.HasAxis(Axis::DEPTH));
517   if (!late_oob_check) {
518     c += "  if (" + dst_oob_check + ") {\n";
519     c += "    return;\n";
520     c += "  }\n";
521   }
522   if (conv_params.weights_upload_type ==
523       ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
524     if (conv_params.linear_spatial) {
525       c += "  int lid = LOCAL_ID_0;\n";
526     } else {
527       c += "  int lid = LOCAL_ID_1 * " + std::to_string(work_group_size_.x) +
528            " + LOCAL_ID_0;\n";
529     }
530   }
531   if (use_simd_broadcast) {
532     c += "  int simd_id = SUB_GROUP_LOCAL_ID;\n";
533   }
534   for (int s = 0; s < block_size.w; ++s) {
535     const std::string sind = std::to_string(s);
536     for (int z = 0; z < block_size.z; ++z) {
537       const std::string zind = std::to_string(z);
538       for (int y = 0; y < block_size.y; ++y) {
539         const std::string yind = std::to_string(y);
540         for (int x = 0; x < block_size.x; ++x) {
541           const std::string xind = std::to_string(x);
542           c += "  ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
543                " = INIT_ACCUM_FLT4(0.0f);\n";
544         }
545       }
546     }
547   }
548   if (!conv_params_.x_kernel_is_1) {
549     for (int x = 0; x < block_size.x; ++x) {
550       const std::string xind = std::to_string(x);
551       const std::string xc = "(DST_X + " + xind + ")";
552       if (stride_correction) {
553         c += "  int xc" + xind + " = " +
554              GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x",
555                                  "args.padding_x") +
556              ";\n";
557       } else {
558         c += "  int xc" + xind + " = " + xc +
559              " * args.stride_x + args.padding_x;\n";
560       }
561     }
562   } else {
563     for (int x = 0; x < block_size.x; ++x) {
564       const std::string xind = std::to_string(x);
565       c += "  int xc" + xind + " = DST_X + " + xind + ";\n";
566       if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
567         c += "  xc" + xind + " = clamp(xc" + xind +
568              ", 0, args.src_tensor.Width() - 1);\n";
569       }
570     }
571   }
572   if (!conv_params_.y_kernel_is_1) {
573     for (int y = 0; y < block_size.y; ++y) {
574       const std::string yind = std::to_string(y);
575       const std::string yc = "(DST_Y + " + yind + ")";
576       c += "  int yc" + yind + " = " + yc +
577            " * args.stride_y + args.padding_y;\n";
578     }
579   } else {
580     for (int y = 0; y < block_size.y; ++y) {
581       const std::string yind = std::to_string(y);
582       c += "  int yc" + yind + " = DST_Y + " + yind + ";\n";
583       if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
584         c += "  yc" + yind + " = clamp(yc" + yind +
585              ", 0, args.src_tensor.Height() - 1);\n";
586       }
587     }
588   }
589   if (src_def.HasAxis(Axis::DEPTH)) {
590     if (!conv_params_.z_kernel_is_1) {
591       for (int z = 0; z < block_size.z; ++z) {
592         const std::string zind = std::to_string(z);
593         const std::string zc = "(DST_Z + " + zind + ")";
594         c += "  int zc" + zind + " = " + zc +
595              " * args.stride_z + args.padding_z;\n";
596       }
597     } else {
598       for (int z = 0; z < block_size.z; ++z) {
599         const std::string zind = std::to_string(z);
600         c += "  int zc" + zind + " = DST_Z + " + zind + ";\n";
601         if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
602           c += "  zc" + zind + " = clamp(zc" + zind +
603                ", 0, args.src_tensor.Depth() - 1);\n";
604         }
605       }
606     }
607   }
608   bool trivial_kernel_size =
609       conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
610   if (src_def.HasAxis(Axis::DEPTH)) {
611     trivial_kernel_size = trivial_kernel_size && conv_params_.z_kernel_is_1;
612   }
613   if (need_local_mem) {
614     c += "  __local " + weights_data_type + " weights_cache[" +
615          std::to_string(local_mem_size) + "];\n";
616   } else if (conv_params.AreWeightsBuffer() &&
617              gpu_info.SupportsPointersInKernels()) {
618     c += "    " + weights_global_ptr + " weights_cache;\n";
619   } else if (!trivial_kernel_size) {
620     c += "  int filter_offset = 0;\n";
621   }
622   if (conv_params.AreWeightsBuffer()) {
623     std::string offset;
624     if (conv_params.different_weights_for_height) {
625       offset = "(DST_S * args.src_tensor.Height() + DST_Y * " +
626                std::to_string(block_size.w) +
627                ") * 4 * args.src_tensor.Slices()";
628     } else {
629       std::string kernel_spatial_offset = "";
630       if (!conv_params_.x_kernel_is_1) {
631         kernel_spatial_offset += " * args.kernel_size_x";
632       }
633       if (!conv_params_.y_kernel_is_1) {
634         kernel_spatial_offset += " * args.kernel_size_y";
635       }
636       if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
637         kernel_spatial_offset += " * args.kernel_size_z";
638       }
639       offset = "DST_S * 4 * args.src_tensor.Slices()" + kernel_spatial_offset;
640     }
641     if (gpu_info.SupportsPointersInKernels()) {
642       c += "  " + weights_global_ptr +
643            " filters_loc = args.weights.GetPtr() + " + offset + ";\n";
644     } else {
645       c += "  int filters_offset = " + offset + ";\n";
646     }
647   }
648   if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
649     c += "  for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
650     for (int z = 0; z < block_size.z; ++z) {
651       const std::string zck = "zck" + std::to_string(z);
652       c += "  int zck" + std::to_string(z) + " = kz * args.dilation_z + zc" +
653            std::to_string(z) + ";\n";
654       if (!src_def.SupportsZeroClamp(Axis::DEPTH)) {
655         c += "  bool in_z" + std::to_string(z) + " = " + zck + " >= 0 && " +
656              zck + " < args.src_tensor.Depth();\n";
657         if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
658           c += "  " + zck + " = clamp(" + zck +
659                ", 0, args.src_tensor.Depth() - 1);\n";
660         }
661       }
662     }
663   }
664   if (!conv_params_.y_kernel_is_1) {
665     c += "  for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
666     for (int y = 0; y < block_size.y; ++y) {
667       const std::string yck = "yck" + std::to_string(y);
668       c += "  int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) +
669            ";\n";
670       if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) {
671         c += "  bool in_y" + std::to_string(y) + " = " + yck + " >= 0 && " +
672              yck + " < args.src_tensor.Height();\n";
673         if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
674           c += "  " + yck + " = clamp(" + yck +
675                ", 0, args.src_tensor.Height() - 1);\n";
676         }
677       }
678     }
679   }
680   if (!conv_params_.x_kernel_is_1) {
681     c += "  for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
682     for (int x = 0; x < block_size.x; ++x) {
683       const std::string xck = "xck" + std::to_string(x);
684       c += "  int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" +
685            std::to_string(x) + ";\n";
686       if (!src_def.SupportsZeroClamp(Axis::WIDTH)) {
687         c += "  bool in_x" + std::to_string(x) + " = " + xck + " >= 0 && " +
688              xck + " < args.src_tensor.Width();\n";
689         if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
690           c += "  " + xck + " = clamp(" + xck +
691                ", 0, args.src_tensor.Width() - 1);\n";
692         }
693       }
694     }
695   }
696   const bool need_multiple_slice_strides =
697       src_def.ReturnsZeroForNegOneRead() && !trivial_kernel_size;
698   for (int z = 0; z < block_size.z; ++z) {
699     const std::string zind = std::to_string(z);
700     for (int y = 0; y < block_size.y; ++y) {
701       const std::string yind = std::to_string(y);
702       for (int x = 0; x < block_size.x; ++x) {
703         const std::string xind = std::to_string(x);
704         std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
705         std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
706         const std::string id = generate_id(xind, yind, zind);
707         std::string coords = "" + xc + ", " + yc;
708         if (src_def.HasAxis(Axis::DEPTH)) {
709           std::string zc =
710               conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
711           coords += ", " + zc;
712         }
713         if (src_def.IsLinear()) {
714           c += "  args.src_tensor.GetAddress(addr" + id + ", " + coords +
715                ", 0);\n";
716           if (need_multiple_slice_strides) {
717             const std::string check = generate_check(xind, yind, zind);
718             c += "  addr" + id + " = select(-1, addr" + id + ", (" + check +
719                  "));\n";
720             c += "  int ds" + id +
721                  " = select(0, args.src_tensor.SliceStride(), (" + check +
722                  "));\n";
723           }
724         }
725       }
726     }
727   }
728   if (src_def.IsLinear() && !need_multiple_slice_strides) {
729     c += "  int ds = args.src_tensor.SliceStride();\n";
730   }
731 
732   auto declare_src = [&]() {
733     for (int z = 0; z < block_size.z; ++z) {
734       const std::string zind = std::to_string(z);
735       for (int y = 0; y < block_size.y; ++y) {
736         const std::string yind = std::to_string(y);
737         for (int x = 0; x < block_size.x; ++x) {
738           const std::string xind = std::to_string(x);
739           const std::string id = generate_id(xind, yind, zind);
740           c += "    " + weights_data_type + " src" + id + ";\n";
741         }
742       }
743     }
744   };
745   const bool conditional_read = gpu_info.IsMali();
746   auto read_src = [&]() {
747     const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
748     for (int z = 0; z < block_size.z; ++z) {
749       const std::string zind = std::to_string(z);
750       for (int y = 0; y < block_size.y; ++y) {
751         const std::string yind = std::to_string(y);
752         for (int x = 0; x < block_size.x; ++x) {
753           const std::string xind = std::to_string(x);
754           std::string id = generate_id(xind, yind, zind);
755           const std::string check = generate_check(xind, yind, zind);
756           std::string address;
757           if (src_def.IsLinear()) {
758             address = "addr" + id;
759           } else {
760             std::string xc =
761                 conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
762             std::string yc =
763                 conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
764             address = "" + xc + ", " + yc;
765             if (src_def.HasAxis(Axis::DEPTH)) {
766               std::string zc =
767                   conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
768               address += ", " + zc;
769             }
770             address += ", s";
771           }
772           if (src_def.ReturnsZeroForNegOneRead()) {
773             c += "    src" + id + " = args.src_tensor.Read<" + cl_type + ">(" +
774                  address + ");\n";
775             const std::string ds = trivial_kernel_size ? "ds" : "ds" + id;
776             c += "    " + address + " += " + ds + ";\n";
777           } else {
778             if (!check.empty()) {
779               if (conditional_read) {
780                 c += "    src" + id + " = " + check +
781                      " ? args.src_tensor.Read<" + cl_type + ">(" + address +
782                      ") : INIT_FLT4(0.0f);\n";
783               } else {
784                 c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
785                      ">(" + address + ") * INIT_FLT(" + check + ");\n";
786               }
787             } else {
788               c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
789                    ">(" + address + ");\n";
790             }
791             if (src_def.IsLinear()) {
792               c += "    " + address + " += ds;\n";
793             }
794           }
795         }
796       }
797     }
798   };
799   const bool weights_type_as_accum_type =
800       !(op_def.precision == CalculationsPrecision::F32_F16 &&
801         conv_params.weights_data_type == DataType::FLOAT16);
802   auto conv_core = [&](int shared_offset) {
803     const std::string channels[] = {"x", "y", "z", "w"};
804     for (int s = 0; s < block_size.w; ++s) {
805       const std::string sind = std::to_string(s);
806       if (weights_type_as_accum_type) {
807         for (int ch = 0; ch < 4; ++ch) {
808           for (int z = 0; z < block_size.z; ++z) {
809             const std::string zind = std::to_string(z);
810             for (int y = 0; y < block_size.y; ++y) {
811               const std::string yind = std::to_string(y);
812               for (int x = 0; x < block_size.x; ++x) {
813                 const std::string xind = std::to_string(x);
814                 std::string R = "r" + generate_id_full(xind, yind, zind, sind);
815                 std::string S = "src" + generate_id(xind, yind, zind);
816                 if (use_simd_broadcast) {
817                   int simd_id = (s * 4 + ch + shared_offset) / simd_size;
818                   int thread_id = (s * 4 + ch + shared_offset) % simd_size;
819                   std::string w_val_x = "SUB_GROUP_BROADCAST(simd_w" +
820                                         std::to_string(simd_id) + ".x, " +
821                                         std::to_string(thread_id) + "u)";
822                   std::string w_val_y = "SUB_GROUP_BROADCAST(simd_w" +
823                                         std::to_string(simd_id) + ".y, " +
824                                         std::to_string(thread_id) + "u)";
825                   std::string w_val_z = "SUB_GROUP_BROADCAST(simd_w" +
826                                         std::to_string(simd_id) + ".z, " +
827                                         std::to_string(thread_id) + "u)";
828                   std::string w_val_w = "SUB_GROUP_BROADCAST(simd_w" +
829                                         std::to_string(simd_id) + ".w, " +
830                                         std::to_string(thread_id) + "u)";
831                   if (GetWeightsDescription().IsI4O4()) {
832                     c += "    " + R + ".x += " + w_val_x + " * " + S + "." +
833                          channels[ch] + ";\n";
834                     c += "    " + R + ".y += " + w_val_y + " * " + S + "." +
835                          channels[ch] + ";\n";
836                     c += "    " + R + ".z += " + w_val_z + " * " + S + "." +
837                          channels[ch] + ";\n";
838                     c += "    " + R + ".w += " + w_val_w + " * " + S + "." +
839                          channels[ch] + ";\n";
840                   } else {
841                     c += "    " + R + "." + channels[ch] + " += " + w_val_x +
842                          " * " + S + ".x;\n";
843                     c += "    " + R + "." + channels[ch] + " += " + w_val_y +
844                          " * " + S + ".y;\n";
845                     c += "    " + R + "." + channels[ch] + " += " + w_val_z +
846                          " * " + S + ".z;\n";
847                     c += "    " + R + "." + channels[ch] + " += " + w_val_w +
848                          " * " + S + ".w;\n";
849                   }
850                 } else {
851                   const std::string weight_id =
852                       std::to_string(s * 4 + ch + shared_offset);
853                   std::string w_val;
854                   if (conv_params.AreWeightsBuffer()) {
855                     if (gpu_info.SupportsPointersInKernels()) {
856                       w_val = "weights_cache[" + weight_id + "]";
857                     } else {
858                       w_val = "args.weights.Read(filters_offset + " +
859                               weight_id + ")";
860                     }
861                   } else {
862                     w_val = "f" + weight_id;
863                   }
864                   if (GetWeightsDescription().IsI4O4()) {
865                     c += "    " + R + " += " + w_val + " * " + S + "." +
866                          channels[ch] + ";\n";
867                   } else {
868                     c += "    " + R + "." + channels[ch] + " += dot(" + w_val +
869                          ", " + S + ");\n";
870                   }
871                 }
872               }
873             }
874           }
875         }
876       } else {  // F32_F16 precision and weights type is float16
877         for (int z = 0; z < block_size.z; ++z) {
878           const std::string zind = std::to_string(z);
879           for (int y = 0; y < block_size.y; ++y) {
880             const std::string yind = std::to_string(y);
881             for (int x = 0; x < block_size.x; ++x) {
882               const std::string xind = std::to_string(x);
883               std::string R = "r" + generate_id_full(xind, yind, zind, sind);
884               std::string S = "src" + generate_id(xind, yind, zind);
885               std::vector<std::string> F(4);
886               for (int i = 0; i < 4; ++i) {
887                 std::string weight_id =
888                     std::to_string(s * 4 + i + shared_offset);
889                 if (conv_params.AreWeightsBuffer()) {
890                   if (gpu_info.SupportsPointersInKernels()) {
891                     F[i] = "weights_cache[" + weight_id + "]";
892                   } else {
893                     F[i] =
894                         "args.weights.Read(filters_offset + " + weight_id + ")";
895                   }
896                 } else {
897                   F[i] = "f" + weight_id;
898                 }
899               }
900               if (GetWeightsDescription().IsI4O4()) {
901                 c += "    " + R + " += TO_ACCUM_TYPE(" + S + ".x * " + F[0] +
902                      " + " + S + ".y * " + F[1] + " + " + S + ".z * " + F[2] +
903                      " + " + S + ".w * " + F[3] + ");\n";
904               } else {
905                 c += "    " + R + ".x += dot(" + S + ", " + F[0] + ");\n";
906                 c += "    " + R + ".y += dot(" + S + ", " + F[1] + ");\n";
907                 c += "    " + R + ".z += dot(" + S + ", " + F[2] + ");\n";
908                 c += "    " + R + ".w += dot(" + S + ", " + F[3] + ");\n";
909               }
910             }
911           }
912         }
913       }
914     }
915   };
916 
917   c += "  int s = 0;\n";
918   c += "  do {\n";
919   declare_src();
920   const int total_work_items =
921       work_group_size_.x * work_group_size_.y * work_group_size_.z;
922   if (conv_params.weights_upload_type ==
923       ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
924     c += GenerateAsyncUpload("weights_cache", "filters_loc",
925                              /*global_offset_name*/ "", local_mem_size);
926   } else if (conv_params.weights_upload_type ==
927              ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
928     c += "    " + barrier + ";\n";
929     c += GenerateUploadByThreads("weights_cache", "filters_loc",
930                                  /*global_offset_name*/ "", "lid",
931                                  total_work_items, local_mem_size);
932   } else if (use_simd_broadcast) {
933     int parts = local_mem_size / simd_size;
934     int reminder = local_mem_size % simd_size;
935     for (int i = 0; i < parts; ++i) {
936       c += "    FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " +
937            std::to_string(i * simd_size) + "];\n";
938     }
939     if (reminder) {
940       c += "    FLT4 simd_w" + std::to_string(parts) + ";\n";
941       c += "    if (simd_id < " + std::to_string(reminder) + ") {\n";
942       c += "      simd_w" + std::to_string(parts) +
943            " = filters_loc[simd_id + " + std::to_string(parts * simd_size) +
944            "];\n";
945       c += "    }\n";
946     }
947   } else if (conv_params.AreWeightsBuffer()) {  // GLOBAL_MEM/CONSTANT_MEM
948     if (gpu_info.SupportsPointersInKernels()) {
949       c += "    weights_cache = filters_loc;\n";
950     }
951   } else {  // TEXTURES_MEM
952     for (int dst_s = 0; dst_s < block_size.w; ++dst_s) {
953       std::string f_y = trivial_kernel_size ? "s" : "filter_offset";
954       if (conv_params.different_weights_for_height) {
955         f_y = "DST_Y * args.src_tensor.Slices() + s";
956       }
957       c += absl::Substitute(
958           R"(    FLT4 f$2 = args.weights0.Read(DST_S + $0, $1);
959     FLT4 f$3 = args.weights1.Read(DST_S + $0, $1);
960     FLT4 f$4 = args.weights2.Read(DST_S + $0, $1);
961     FLT4 f$5 = args.weights3.Read(DST_S + $0, $1);
962 )",
963           dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2,
964           dst_s * 4 + 3);
965     }
966     if (!trivial_kernel_size) {
967       c += "    filter_offset++;\n";
968     }
969   }
970   read_src();
971   c += "    s += 1;\n";
972   if (conv_params.weights_upload_type ==
973       ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
974     c += "    " + barrier + ";\n";
975   }
976   conv_core(0);
977   for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
978     read_src();
979     conv_core(i * block_size.w * 4);
980     c += "    s += 1;\n";
981   }
982   if (conv_params.AreWeightsBuffer()) {
983     if (gpu_info.SupportsPointersInKernels()) {
984       c += "    filters_loc += " + std::to_string(local_mem_size) + ";\n";
985     } else {
986       c += "    filters_offset += " + std::to_string(local_mem_size) + ";\n";
987     }
988   }
989   c += "  } while (s < args.src_tensor.Slices());\n";
990   if (!conv_params.x_kernel_is_1) {
991     c += "  };\n";
992   }
993   if (!conv_params.y_kernel_is_1) {
994     c += "  };\n";
995   }
996   if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
997     c += "  };\n";
998   }
999   if (conv_params.AreWeightsBuffer()) {
1000     if (conv_params.weights_upload_type ==
1001         ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
1002       c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S",
1003                                block_size.w);
1004     } else if (conv_params.weights_upload_type ==
1005                ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
1006       c += "  " + barrier + ";\n";
1007       c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()",
1008                                    "DST_S", "lid", total_work_items,
1009                                    block_size.w);
1010       c += "  " + barrier + ";\n";
1011     } else if (gpu_info.SupportsPointersInKernels()) {
1012       c += "  weights_cache = args.biases.GetPtr() + DST_S;\n";
1013     }
1014   }
1015   if (late_oob_check) {
1016     c += "  if (" + dst_oob_check + ") {\n";
1017     c += "    return;\n";
1018     c += "  }\n";
1019   }
1020 
1021   auto generate_dst_check = [&](int x, int y, int z) {
1022     std::string check;
1023     const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
1024     const std::vector<std::string> names{"Width()", "Height()", "Depth()"};
1025     std::vector<std::string> coords(3);
1026     coords[0] = "DST_X + " + std::to_string(x);
1027     coords[1] = "DST_Y + " + std::to_string(y);
1028     coords[2] = "DST_Z + " + std::to_string(z);
1029     const std::vector<int> ids{x, y, z};
1030     for (int i = 0; i < axes.size(); ++i) {
1031       const auto& axis = axes[i];
1032       if (src_def.HasAxis(axis) && ids[i] != 0) {
1033         if (!check.empty()) {
1034           check += " && ";
1035         }
1036         check += coords[i] + " < args.dst_tensor." + names[i];
1037       }
1038     }
1039     return check;
1040   };
1041 
1042   for (int s = 0; s < block_size.w; ++s) {
1043     const std::string sind = std::to_string(s);
1044     c += "  if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n";
1045     c += "  {\n";
1046     if (conv_params.AreWeightsBuffer() &&
1047         gpu_info.SupportsPointersInKernels()) {
1048       c += "    FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n";
1049     } else {
1050       c += "    FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n";
1051     }
1052     for (int z = 0; z < block_size.z; ++z) {
1053       const std::string zind = std::to_string(z);
1054       for (int y = 0; y < block_size.y; ++y) {
1055         const std::string yind = std::to_string(y);
1056         for (int x = 0; x < block_size.x; ++x) {
1057           const std::string xind = std::to_string(x);
1058           const std::string id = generate_id_full(xind, yind, zind, sind);
1059           const std::string check = generate_dst_check(x, y, z);
1060           std::string coords = "DST_X + " + xind + ", DST_Y + " + yind;
1061           if (src_def.HasAxis(Axis::DEPTH)) {
1062             coords += ", DST_Z + " + zind;
1063           }
1064           coords += ", DST_S + " + sind;
1065           if (!check.empty()) {
1066             c += "  if (" + check + ") {\n";
1067           } else {
1068             c += "  {\n";
1069           }
1070           c += "    FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
1071           c += "    args.dst_tensor.Write(res, " + coords + ");\n";
1072           c += "  }\n";
1073         }
1074       }
1075     }
1076     c += "  }\n";
1077   }
1078   c += "}\n";
1079   return c;
1080 }
1081 
// Picks convolution tuning parameters (block size, work-group geometry,
// weights upload strategy, weights layout) for the given GPU vendor.
//
// Args:
//   gpu_info: capabilities of the target GPU; selects the vendor branch.
//   definition: operation definition; precision drives weights data type and
//       several per-vendor block-size tweaks.
//   src_depth/dst_depth: number of 4-channel slices in src/dst.
//   x_kernel_is_1/y_kernel_is_1: true when the kernel is trivial (1x1, unit
//       stride/dilation, no padding) along that axis.
//   different_weights_for_height: per-row weights mode (used e.g. by
//       transposed-conv-like cases); forces a fixed 1-D work-group.
//   dst_shape: optional destination shape; when present, used to estimate
//       occupancy and shrink the block size for small workloads.
//
// Side effects: writes work_group_size_ and work_group_launch_order_ members.
// Returns: the chosen ConvParams. All numeric constants below are empirically
// tuned per vendor — do not change them without benchmarking.
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition, int src_depth,
    int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
    bool different_weights_for_height, const BHWC* dst_shape) {
  ConvParams conv_params;
  conv_params.linear_spatial = false;
  conv_params.linear_all = false;
  conv_params.block_size = int4(1, 1, 1, 1);
  conv_params.weights_data_type =
      DeduceDataTypeFromPrecision(definition.precision);
  conv_params.x_kernel_is_1 = x_kernel_is_1;
  conv_params.y_kernel_is_1 = y_kernel_is_1;
  conv_params.different_weights_for_height = different_weights_for_height;
  if (gpu_info.IsNvidia()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      // Flatten spatial dims into one linear dimension for better occupancy.
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(2, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    // Choose dst-slice block: prefer 4, then 2, else whatever remains.
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (dst_shape) {
      // Estimate occupancy; for small workloads shrink the block so that
      // enough warps per compute unit remain to hide latency.
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      float task_size_per_cu =
          static_cast<float>(task_size) / gpu_info.GetComputeUnitsCount();
      int block_size = conv_params.block_size.x * conv_params.block_size.y *
                       conv_params.block_size.w;
      float threads_per_cu = task_size_per_cu / block_size;
      float warps_per_cu = threads_per_cu / 32 /*warp_size*/;
      if (warps_per_cu < 8.0f) {
        conv_params.block_size.x = 1;
      }
      if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4) {
        conv_params.block_size.w /= 2;
      }
      if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2) {
        conv_params.block_size.w /= 2;
      }
    }
    // Unroll over source slices when divisibility allows.
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsPowerVR()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    // PowerVR: keep weights in full precision unless running pure F16.
    conv_params.weights_data_type =
        definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16
                                                           : DataType::FLOAT32;
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    // Uses async_work_group_copy-style upload into local memory.
    conv_params.weights_upload_type =
        WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (definition.precision == CalculationsPrecision::F16) {
      // F16 path: cap dst block at 4 and unroll the src-slice loop harder.
      conv_params.block_size.w = std::min(4, conv_params.block_size.w);
      if (src_depth % 2 == 0) {
        conv_params.src_depth_loop_size = 2;
      }
      if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
        conv_params.src_depth_loop_size = 4;
      }
      if (conv_params.block_size.w == 1) {
        if (src_depth % 2 == 0) {
          conv_params.src_depth_loop_size = 2;
        }
        if (src_depth % 4 == 0) {
          conv_params.src_depth_loop_size = 4;
        }
        // Small src: fully unroll the source-slice loop.
        if (src_depth <= 8) {
          conv_params.src_depth_loop_size = src_depth;
        }
      }
      conv_params.block_size.x = 2;
    }
  } else if (gpu_info.IsAMD()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      work_group_size_ = int3(8, 4, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    }

    conv_params.block_size = int4(2, 1, 1, 1);
    // 1x1 kernels allow an extra row per thread.
    if (x_kernel_is_1 && y_kernel_is_1) {
      conv_params.block_size.y = 2;
    }
    conv_params.src_depth_loop_size = 1;
    // AMD: weights fit well in constant memory.
    conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = 1;
    }
    if (src_depth % 2 == 0 && src_depth >= 16) {
      conv_params.src_depth_loop_size = 2;
    }
  } else if (gpu_info.IsMali()) {
    // Mali: derive an aggregate block budget from the task size, then map it
    // onto concrete (x, y, z, w) block shapes.
    int block_size = 2;
    if (dst_shape) {
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      block_size = GetRecommendedBlockSizeForConv(
          gpu_info, definition.precision, task_size);
    }
    if (!x_kernel_is_1 || !y_kernel_is_1) {
      block_size = std::min(block_size, 4);
    }
    if (block_size == 8) {
      // dst_depth of 1 or 3 can't use a dst-slice block of 2 efficiently.
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 2, 1, 2);
      }
    } else if (block_size == 4) {
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 1, 1, 1);
        if (definition.precision == CalculationsPrecision::F32 &&
            gpu_info.mali_info.IsValhall()) {
          conv_params.block_size.y = 2;
        } else {
          conv_params.block_size.w = 2;
        }
      }
    } else if (block_size == 2) {
      conv_params.block_size = int4(2, 1, 1, 1);
    } else {
      conv_params.block_size = int4(1, 1, 1, 1);
    }
    conv_params.src_depth_loop_size = 1;
    MaliInfo mali_info = gpu_info.mali_info;
    // Midgard-generation GPUs don't benefit from src-loop unrolling here.
    if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && block_size == 1 && !mali_info.IsMidgard() &&
        definition.precision == CalculationsPrecision::F16) {
      conv_params.src_depth_loop_size = 4;
    }
    work_group_size_ = int3(4, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else if (gpu_info.IsAdreno()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      // Older Adreno 3xx: tune the block shape per precision.
      if (definition.precision == CalculationsPrecision::F16) {
        conv_params.block_size = int4(2, 2, 1, 2);
      } else if (definition.precision == CalculationsPrecision::F32_F16) {
        conv_params.block_size = int4(2, 1, 1, 2);
      } else {  // F32
        conv_params.block_size = int4(2, 2, 1, 1);
      }
    }
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    if (definition.src_tensors.size() == 2) {
      // dynamic weights supported only with buffers.
      conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4;
    }
  } else if (gpu_info.IsIntel()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    int sub_group_size = 16;
    const bool supports_subgroups =
        gpu_info.SupportsExtension("cl_khr_subgroups") ||
        gpu_info.SupportsExtension("cl_intel_subgroups");
    // Prefer SIMD broadcast of weights when the required subgroup extensions
    // are available (not usable with mixed F32_F16 precision).
    if (definition.precision != CalculationsPrecision::F32_F16 &&
        supports_subgroups &&
        gpu_info.SupportsExtension("cl_intel_required_subgroup_size") &&
        gpu_info.SupportsSubGroupWithSize(sub_group_size)) {
      conv_params.weights_upload_type =
          WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
      conv_params.simd_size = sub_group_size;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsApple()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    work_group_size_ = int3(8, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = true;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else {
    // Unknown vendor: conservative defaults.
    conv_params.block_size = int4(1, 1, 1, 4);
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  }
  // Weights layout: buffer-based uploads use an O-spatial-I grouping;
  // texture-based uploads use the 2D X4 layout. Apple swaps I4O4 vs O4I4
  // (see GetWeightsDescription().IsI4O4() usage in the generated kernel).
  if (conv_params.AreWeightsBuffer()) {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout = WeightsLayout::kOSpatialIOGroupO4I4;
    } else {
      conv_params.weights_layout = WeightsLayout::kOSpatialIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout =
          WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4;
    } else {
      conv_params.weights_layout =
          WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4;
    }
  }

  return conv_params;
}
1370 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1371 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1372     const GpuInfo& gpu_info, const OperationDef& definition,
1373     const Convolution2DAttributes& attr, const BHWC* dst_shape) {
1374   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1375   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1376   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
1377                              attr.dilations.w == 1 &&
1378                              attr.padding.prepended.w == 0 &&
1379                              attr.padding.appended.w == 0;
1380   const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
1381                              attr.dilations.h == 1 &&
1382                              attr.padding.prepended.h == 0 &&
1383                              attr.padding.appended.h == 0;
1384   return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1385                          x_kernel_is_1, y_kernel_is_1, false, dst_shape);
1386 }
1387 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution3DAttributes & attr,const BHWDC * dst_shape)1388 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1389     const GpuInfo& gpu_info, const OperationDef& definition,
1390     const Convolution3DAttributes& attr, const BHWDC* dst_shape) {
1391   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1392   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1393   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
1394                              attr.dilations.w == 1 &&
1395                              attr.padding.prepended.w == 0 &&
1396                              attr.padding.appended.w == 0;
1397   const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
1398                              attr.dilations.h == 1 &&
1399                              attr.padding.prepended.h == 0 &&
1400                              attr.padding.appended.h == 0;
1401   const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 &&
1402                              attr.dilations.d == 1 &&
1403                              attr.padding.prepended.d == 0 &&
1404                              attr.padding.appended.d == 0;
1405 
1406   ConvPowerVR::ConvParams result;
1407   BHWC shape;
1408   if (dst_shape) {
1409     shape.b = dst_shape->b;
1410     shape.h = dst_shape->h * dst_shape->d;
1411     shape.w = dst_shape->w;
1412     shape.c = dst_shape->c;
1413     result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1414                              x_kernel_is_1, y_kernel_is_1, false, &shape);
1415   } else {
1416     result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1417                              x_kernel_is_1, y_kernel_is_1, false, nullptr);
1418   }
1419   result.z_kernel_is_1 = z_kernel_is_1;
1420   return result;
1421 }
1422 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC & weights_shape,const BHWC * dst_shape)1423 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1424     const GpuInfo& gpu_info, const OperationDef& definition,
1425     const Convolution2DAttributes& attr, const BHWC& weights_shape,
1426     const BHWC* dst_shape) {
1427   const int dst_depth = DivideRoundUp(weights_shape.b, 4);
1428   const int src_depth = DivideRoundUp(weights_shape.c, 4);
1429   const bool x_kernel_is_1 =
1430       weights_shape.w == 1 && attr.strides.w == 1 && attr.dilations.w == 1 &&
1431       attr.padding.prepended.w == 0 && attr.padding.appended.w == 0;
1432   const bool y_kernel_is_1 =
1433       weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
1434       attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
1435   return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
1436                          x_kernel_is_1, y_kernel_is_1, false, dst_shape);
1437 }
1438 
GuessBestParams(const GpuInfo & gpu_info,const OperationDef & definition,const FullyConnectedAttributes & attr,const BHWC * dst_shape)1439 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
1440     const GpuInfo& gpu_info, const OperationDef& definition,
1441     const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
1442   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1443   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1444   ConvPowerVR::ConvParams params = GuessBestParams(
1445       gpu_info, definition, src_depth, dst_depth, true, true, false, dst_shape);
1446   work_group_size_.x *= work_group_size_.y;
1447   work_group_size_.y = 1;
1448   params.block_size.x *= params.block_size.y;
1449   params.block_size.y = 1;
1450   return params;
1451 }
1452 
GuessBestParamsWinograd(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1453 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
1454     const GpuInfo& gpu_info, const OperationDef& definition,
1455     const Convolution2DAttributes& attr, const BHWC* dst_shape) {
1456   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
1457   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
1458   ConvPowerVR::ConvParams params = GuessBestParams(
1459       gpu_info, definition, src_depth, dst_depth, true, true, true, dst_shape);
1460   params.block_size.x *= params.block_size.y;
1461   params.block_size.y = 1;
1462   return params;
1463 }
1464 
CreateConvPowerVR(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1465 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
1466                               const OperationDef& definition,
1467                               const Convolution2DAttributes& attr,
1468                               const BHWC* dst_shape) {
1469   ConvPowerVR result(definition, attr, gpu_info, dst_shape);
1470   result.GenerateCode(gpu_info);
1471   result.UploadData(attr.weights, attr.bias);
1472   return result;
1473 }
1474 
CreateConvPowerVR(const GpuInfo & gpu_info,const OperationDef & definition,const FullyConnectedAttributes & attr,const BHWC * dst_shape)1475 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
1476                               const OperationDef& definition,
1477                               const FullyConnectedAttributes& attr,
1478                               const BHWC* dst_shape) {
1479   ConvPowerVR result(definition, attr, gpu_info, dst_shape);
1480   result.GenerateCode(gpu_info);
1481   result.UploadData(attr.weights, attr.bias);
1482   return result;
1483 }
1484 
CreateConvPowerVRDynamicWeights(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC & weights_shape,const BHWC * dst_shape)1485 ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
1486                                             const OperationDef& definition,
1487                                             const Convolution2DAttributes& attr,
1488                                             const BHWC& weights_shape,
1489                                             const BHWC* dst_shape) {
1490   ConvPowerVR result(definition, attr, weights_shape, gpu_info, dst_shape);
1491   result.GenerateCode(gpu_info);
1492   result.UploadBias(attr.bias);
1493   return result;
1494 }
1495 
CreateConvPowerVRWino4x4To6x6(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr,const BHWC * dst_shape)1496 ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
1497                                           const OperationDef& definition,
1498                                           const Convolution2DAttributes& attr,
1499                                           const BHWC* dst_shape) {
1500   ConvPowerVR result(definition);
1501   result.conv_params_ =
1502       result.GuessBestParamsWinograd(gpu_info, definition, attr, dst_shape);
1503   result.GenerateCode(gpu_info);
1504   result.UploadDataForWinograd4x4To6x6(attr.weights);
1505   return result;
1506 }
1507 
CreateConvPowerVR3D(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution3DAttributes & attr,const BHWDC * dst_shape)1508 ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
1509                                 const OperationDef& definition,
1510                                 const Convolution3DAttributes& attr,
1511                                 const BHWDC* dst_shape) {
1512   ConvPowerVR result(definition, attr, gpu_info, dst_shape);
1513   result.GenerateCode(gpu_info);
1514   result.UploadWeights(attr.weights);
1515   result.UploadBias(attr.bias);
1516   return result;
1517 }
1518 
1519 }  // namespace gpu
1520 }  // namespace tflite
1521