/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.h"

#include <algorithm>
#include <string>
#include <utility>

#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {
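// Emits source for a cooperative global->local memory copy: each of the
// 'total_work_items' threads in the work group copies elements with a stride
// of the work group size, and a trailing 'if' covers the remainder when
// 'elements_to_upload' is not a multiple of 'total_work_items'.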
std::string GenerateUploadByThreads(const std::string& local_ptr_name,
                                    const std::string& global_ptr_name,
                                    const std::string& global_offset_name,
                                    const std::string& lid_name,
                                    int total_work_items,
                                    int elements_to_upload) {
  std::string c;
  std::string offset =
      global_offset_name.empty() ? "" : global_offset_name + " + ";
  const int groups = elements_to_upload / total_work_items;
  const int remainder = elements_to_upload % total_work_items;
  for (int i = 0; i < groups; ++i) {
    c += "    " + local_ptr_name + "[" + lid_name + " + " +
         std::to_string(total_work_items * i) + "] = " + global_ptr_name + "[" +
         offset + lid_name + " + " + std::to_string(total_work_items * i) +
         "];\n";
  }
  if (remainder != 0) {
    c += "    if (" + lid_name + " < " + std::to_string(remainder) + ") {\n";
    c += "      " + local_ptr_name + "[" + lid_name + " + " +
         std::to_string(total_work_items * groups) + "] = " + global_ptr_name +
         "[" + offset + lid_name + " + " +
         std::to_string(total_work_items * groups) + "];\n";
    c += "    }\n";
  }
  return c;
}

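// Emits a single async_work_group_copy that uploads 'elements_to_upload'
// values from global to local memory; OpenCL requires the copy to be issued
// uniformly by all work items in the group.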
std::string GenerateAsyncUpload(const std::string& local_ptr_name,
                                const std::string& global_ptr_name,
                                const std::string& global_offset_name,
                                int elements_to_upload) {
  std::string c;
  std::string offset =
      global_offset_name.empty() ? "" : " + " + global_offset_name;
  c += "    async_work_group_copy(" + local_ptr_name + ", " + global_ptr_name +
       offset + ", " + std::to_string(elements_to_upload) + ", 0);\n";
  return c;
}

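// Emits code that computes the destination block origin (DST_X/Y/Z/S) for the
// current thread. Depending on the flags, the spatial axes (and optionally all
// axes) are folded into one linear id; 'launch_remap' inverts
// 'work_group_launch_order' so GROUP_ID_* can be permuted back to the logical
// grid axes.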
std::string GenerateBlockCoords(const int4& block_size,
                                const int3& work_group_launch_order,
                                bool linear_spatial, bool linear_all,
                                bool need_depth) {
  std::string c;
  int3 launch_remap;
  launch_remap[work_group_launch_order.x] = 0;
  launch_remap[work_group_launch_order.y] = 1;
  launch_remap[work_group_launch_order.z] = 2;
  if (linear_all) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int DST_S = (linear_id / args.task_size_spatial) * " +
         std::to_string(block_size.w) + ";\n";
    c += "  int linear_spatial = linear_id % args.task_size_spatial;\n";
    if (need_depth) {
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += "  linear_spatial = linear_spatial / args.task_size_x;\n";
      c += "  int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += "  int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
  } else if (linear_spatial) {
    if (work_group_launch_order[0] == 0) {
      c += "  int linear_spatial = GLOBAL_ID_0;\n";
    } else {
      c += "  int linear_spatial = GROUP_ID_" +
           std::to_string(launch_remap[0]) + " * GROUP_SIZE_0 + LOCAL_ID_0;\n";
    }
    if (need_depth) {
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += "  linear_spatial = linear_spatial / args.task_size_x;\n";
      c += "  int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += "  int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
    if (work_group_launch_order[1] == 1) {
      c +=
          "  int DST_S = GLOBAL_ID_1 * " + std::to_string(block_size.w) + ";\n";
    } else {
      c += "  int DST_S = (GROUP_ID_" + std::to_string(launch_remap[1]) +
           " * GROUP_SIZE_1 + LOCAL_ID_1) * " + std::to_string(block_size.w) +
           ";\n";
    }
  } else {
    if (work_group_launch_order[0] == 0) {
      c +=
          "  int DST_X = GLOBAL_ID_0 * " + std::to_string(block_size.x) + ";\n";
    } else {
      c += "  int DST_X = (GROUP_ID_" + std::to_string(launch_remap[0]) +
           " * GROUP_SIZE_0 + LOCAL_ID_0) * " + std::to_string(block_size.x) +
           ";\n";
    }
    std::string global_id_1;
    if (work_group_launch_order[1] == 1) {
      global_id_1 = "GLOBAL_ID_1";
    } else {
      global_id_1 = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
                    " * GROUP_SIZE_1 + LOCAL_ID_1)";
    }
    if (need_depth) {
      c += "  int linear_id_1 = " + global_id_1 + ";\n";
      c += "  int DST_Z = (linear_id_1 / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
      c += "  int DST_Y = (linear_id_1 % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
    } else {
      c += "  int DST_Y = " + global_id_1 + " * " +
           std::to_string(block_size.y) + ";\n";
    }
    if (work_group_launch_order[2] == 2) {
      c +=
          "  int DST_S = GLOBAL_ID_2 * " + std::to_string(block_size.w) + ";\n";
    } else {
      c += "  int DST_S = (GROUP_ID_" + std::to_string(launch_remap[2]) +
           " * GROUP_SIZE_2 + LOCAL_ID_2) * " + std::to_string(block_size.w) +
           ";\n";
    }
  }

  return c;
}
}  // namespace

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const BHWC& weights_shape, const GpuInfo& gpu_info,
                         const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, weights_shape,
                                   dst_shape)) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const FullyConnectedAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1) {}

ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
    : GPUOperation(std::move(operation)),
      stride_(operation.stride_),
      padding_(operation.padding_),
      kernel_size_(operation.kernel_size_),
      dilation_(operation.dilation_),
      conv_params_(operation.conv_params_) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution3DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWDC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
               -attr.padding.prepended.d, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
                   attr.weights.shape.d, 1),
      dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}

ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
  if (this != &operation) {
    std::swap(stride_, operation.stride_);
    std::swap(padding_, operation.padding_);
    std::swap(kernel_size_, operation.kernel_size_);
    std::swap(dilation_, operation.dilation_);
    std::swap(conv_params_, operation.conv_params_);
    GPUOperation::operator=(std::move(operation));
  }
  return *this;
}

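// Generates the kernel source for the current conv_params_ and records the
// compiler options the target GPU benefits from (relaxed math on PowerVR F16
// and Mali, OpenCL 2.0 for sub-group broadcast, full SIMD on Adreno 3xx).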
void ConvPowerVR::GenerateCode(const GpuInfo& gpu_info) {
  if (conv_params_.linear_all) {
    grid_dimension_ = 1;
  } else if (conv_params_.linear_spatial) {
    grid_dimension_ = 2;
  }
  const bool stride_correction =
      definition_.IsBatchSupported() && stride_.x != 1;
  code_ = GenerateConv(gpu_info, definition_, stride_correction, conv_params_);
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  if (gpu_info.IsMali()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  if (conv_params_.IsPrivateMemBroadcast() && gpu_info.IsCL20OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl20);
  }
  bool kernel_is_trivial =
      conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
    kernel_is_trivial = kernel_is_trivial && conv_params_.z_kernel_is_1;
  }
  if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx() &&
      definition_.precision == CalculationsPrecision::F16 &&
      kernel_is_trivial) {
    compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
  }
}

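// Binds the runtime kernel arguments. Stride/padding/dilation along an axis
// are only bound when the kernel is not 1x1 along that axis; the x padding
// and dilation are pre-multiplied by batch because the width axis is batched.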
absl::Status ConvPowerVR::BindArguments(ArgumentsBinder* args) {
  if (!conv_params_.x_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_x", stride_.x));
    RETURN_IF_ERROR(args->SetInt("padding_x", padding_.x * src_[0]->Batch()));
    RETURN_IF_ERROR(args->SetInt("kernel_size_x", kernel_size_.x));
    RETURN_IF_ERROR(args->SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
  }
  if (!conv_params_.y_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_y", stride_.y));
    RETURN_IF_ERROR(args->SetInt("padding_y", padding_.y));
    RETURN_IF_ERROR(args->SetInt("kernel_size_y", kernel_size_.y));
    RETURN_IF_ERROR(args->SetInt("dilation_y", dilation_.y));
  }
  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH) &&
      !conv_params_.z_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_z", stride_.z));
    RETURN_IF_ERROR(args->SetInt("padding_z", padding_.z));
    RETURN_IF_ERROR(args->SetInt("kernel_size_z", kernel_size_.z));
    RETURN_IF_ERROR(args->SetInt("dilation_z", dilation_.z));
  }
  const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                        conv_params_.block_size.x);
  const int task_size_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int task_size_z =
      DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
  RETURN_IF_ERROR(args->SetInt("task_size_x", task_size_x));
  RETURN_IF_ERROR(args->SetInt("task_size_y", task_size_y));
  const int task_size_spatial = task_size_x * task_size_y * task_size_z;
  RETURN_IF_ERROR(args->SetInt("task_size_spatial", task_size_spatial));
  return absl::OkStatus();
}

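// Returns the kernel grid: one dimension when everything is linearized, two
// when only the spatial axes are linearized, otherwise x / y*z / slices.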
int3 ConvPowerVR::GetGridSize() const {
  const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                        conv_params_.block_size.x);
  const int task_size_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int task_size_z =
      DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
  const int task_size_s =
      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w);
  if (conv_params_.linear_all) {
    return int3(task_size_x * task_size_y * task_size_z * task_size_s, 1, 1);
  } else if (conv_params_.linear_spatial) {
    return int3(task_size_x * task_size_y * task_size_z, task_size_s, 1);
  } else {
    return int3(task_size_x, task_size_y * task_size_z, task_size_s);
  }
}

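// Work group tuning is skipped when the kernel depends on a specific work
// group size: local-memory weight uploads and fixed-size launches must keep
// the size chosen in GuessBestParams.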
void ConvPowerVR::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (conv_params_.weights_upload_type ==
          WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
      conv_params_.weights_upload_type ==
          WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      conv_params_.fixed_work_group_size) {
    work_groups->push_back(work_group_size_);
    return;
  }
  GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
}

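// Builds the full convolution kernel source: tensor/argument declarations,
// block coordinates, the source-slice loop with the selected weights upload
// strategy (local memory, async copy, sub-group broadcast, textures), the
// multiply-accumulate core, and the bias-add / write-out epilogue.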
std::string ConvPowerVR::GenerateConv(const GpuInfo& gpu_info,
                                      const OperationDef& op_def,
                                      bool stride_correction,
                                      const ConvParams& conv_params) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  AddSrcTensor("src_tensor", src_desc);
  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].data_type;
    desc.element_size = 4;
    desc.memory_type = conv_params.weights_upload_type ==
                               ConvPowerVR::WeightsUploadType::CONSTANT_MEM
                           ? MemoryType::CONSTANT
                           : MemoryType::GLOBAL;

    AddSrcBuffer("weights", desc);
  }

  const auto& src_def = op_def.src_tensors[0];

  auto generate_id = [&](const std::string& x, const std::string& y,
                         const std::string& z) {
    std::string id;
    if (src_def.HasAxis(Axis::WIDTH)) {
      id += "_w" + x;
    }
    if (src_def.HasAxis(Axis::HEIGHT)) {
      id += "_h" + y;
    }
    if (src_def.HasAxis(Axis::DEPTH)) {
      id += "_d" + z;
    }
    return id;
  };

  auto generate_id_full = [&](const std::string& x, const std::string& y,
                              const std::string& z, const std::string& s) {
    return generate_id(x, y, z) + "_s" + s;
  };

  auto generate_check = [&](const std::string& x, const std::string& y,
                            const std::string& z) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"in_x", "in_y", "in_z"};
    const std::vector<bool> is_1{conv_params_.x_kernel_is_1,
                                 conv_params_.y_kernel_is_1,
                                 conv_params_.z_kernel_is_1};
    const std::vector<std::string> coords{x, y, z};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) &&
          !is_1[i]) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i] + coords[i];
      }
    }
    return check;
  };

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  AddDstTensor("dst_tensor", dst_desc);

  if (!conv_params_.x_kernel_is_1) {
    args_.AddInt("stride_x");
    args_.AddInt("padding_x");
    args_.AddInt("kernel_size_x");
    args_.AddInt("dilation_x");
  }
  if (!conv_params_.y_kernel_is_1) {
    args_.AddInt("stride_y");
    args_.AddInt("padding_y");
    args_.AddInt("kernel_size_y");
    args_.AddInt("dilation_y");
  }
  if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
    args_.AddInt("stride_z");
    args_.AddInt("padding_z");
    args_.AddInt("kernel_size_z");
    args_.AddInt("dilation_z");
  }
  args_.AddInt("task_size_x");
  args_.AddInt("task_size_y");
  args_.AddInt("task_size_spatial");

  const int wg_total_size =
      work_group_size_.x * work_group_size_.y * work_group_size_.z;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";

  const bool need_local_mem =
      conv_params.weights_upload_type ==
          ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      conv_params.weights_upload_type ==
          ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;

  const int local_mem_size =
      conv_params.block_size.w * 4 * conv_params.src_depth_loop_size;

  const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast();
  const int simd_size = conv_params.simd_size;

  const bool late_oob_check = need_local_mem || use_simd_broadcast;

  const std::string weights_space =
      conv_params.weights_upload_type ==
              ConvPowerVR::WeightsUploadType::CONSTANT_MEM
          ? "__constant"
          : "__global";

  const std::string weights_data_type =
      conv_params.weights_data_type == DataType::FLOAT32 ? "float4" : "half4";

  const std::string weights_global_ptr =
      weights_space + " " + weights_data_type + "*";

  std::string c;
  if (use_simd_broadcast && gpu_info.IsApiOpenCl()) {
    if (gpu_info.opencl_info.cl_version == OpenClVersion::kCl2_0) {
      c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
    } else if (gpu_info.SupportsExtension("cl_intel_subgroups")) {
      c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n";
    }
  }
  const int4 block_size = conv_params.block_size;
  if (conv_params.fixed_work_group_size && gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(" +
         std::to_string(work_group_size_.x) + ", " +
         std::to_string(work_group_size_.y) + ", " +
         std::to_string(work_group_size_.z) + ")))\n";
  }
  if (use_simd_broadcast && gpu_info.IsIntel() && gpu_info.IsApiOpenCl()) {
    c += "__attribute__((intel_reqd_sub_group_size(" +
         std::to_string(simd_size) + ")))\n";
  }
  std::string dst_oob_check;
  if (src_def.HasAxis(Axis::DEPTH)) {
    if (conv_params.linear_all) {
      dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
    } else if (conv_params.linear_spatial) {
      dst_oob_check =
          "DST_Z >= args.dst_tensor.Depth() || DST_S >= "
          "args.dst_tensor.Slices()";
    } else {
      dst_oob_check =
          "DST_X >= args.dst_tensor.Width() || DST_Z >= "
          "args.dst_tensor.Depth() || DST_S >= args.dst_tensor.Slices()";
    }
  } else {
    if (conv_params.linear_all) {
      dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
    } else if (conv_params.linear_spatial) {
      dst_oob_check =
          "DST_Y >= args.dst_tensor.Height() || DST_S >= "
          "args.dst_tensor.Slices()";
    } else {
      dst_oob_check =
          "DST_X >= args.dst_tensor.Width() || DST_Y >= "
          "args.dst_tensor.Height() || DST_S >= args.dst_tensor.Slices()";
    }
  }
  c += "MAIN_FUNCTION($0) {\n";
  c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
                           conv_params.linear_spatial, conv_params.linear_all,
                           src_def.HasAxis(Axis::DEPTH));
  if (!late_oob_check) {
    c += "  if (" + dst_oob_check + ") {\n";
    c += "    return;\n";
    c += "  }\n";
  }
  if (conv_params.weights_upload_type ==
      ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    if (conv_params.linear_spatial) {
      c += "  int lid = LOCAL_ID_0;\n";
    } else {
      c += "  int lid = LOCAL_ID_1 * " + std::to_string(work_group_size_.x) +
           " + LOCAL_ID_0;\n";
    }
  }
  if (use_simd_broadcast) {
    c += "  int simd_id = SUB_GROUP_LOCAL_ID;\n";
  }
  for (int s = 0; s < block_size.w; ++s) {
    const std::string sind = std::to_string(s);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          c += "  ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
               " = INIT_ACCUM_FLT4(0.0f);\n";
        }
      }
    }
  }
  if (!conv_params_.x_kernel_is_1) {
    for (int x = 0; x < block_size.x; ++x) {
      const std::string xind = std::to_string(x);
      const std::string xc = "(DST_X + " + xind + ")";
      if (stride_correction) {
        c += "  int xc" + xind + " = " +
             GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x",
                                 "args.padding_x") +
             ";\n";
      } else {
        c += "  int xc" + xind + " = " + xc +
             " * args.stride_x + args.padding_x;\n";
      }
    }
  } else {
    for (int x = 0; x < block_size.x; ++x) {
      const std::string xind = std::to_string(x);
      c += "  int xc" + xind + " = DST_X + " + xind + ";\n";
      if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
        c += "  xc" + xind + " = clamp(xc" + xind +
             ", 0, args.src_tensor.Width() - 1);\n";
      }
    }
  }
  if (!conv_params_.y_kernel_is_1) {
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      const std::string yc = "(DST_Y + " + yind + ")";
      c += "  int yc" + yind + " = " + yc +
           " * args.stride_y + args.padding_y;\n";
    }
  } else {
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      c += "  int yc" + yind + " = DST_Y + " + yind + ";\n";
      if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
        c += "  yc" + yind + " = clamp(yc" + yind +
             ", 0, args.src_tensor.Height() - 1);\n";
      }
    }
  }
  if (src_def.HasAxis(Axis::DEPTH)) {
    if (!conv_params_.z_kernel_is_1) {
      for (int z = 0; z < block_size.z; ++z) {
        const std::string zind = std::to_string(z);
        const std::string zc = "(DST_Z + " + zind + ")";
        c += "  int zc" + zind + " = " + zc +
             " * args.stride_z + args.padding_z;\n";
      }
    } else {
      for (int z = 0; z < block_size.z; ++z) {
        const std::string zind = std::to_string(z);
        c += "  int zc" + zind + " = DST_Z + " + zind + ";\n";
        if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
          c += "  zc" + zind + " = clamp(zc" + zind +
               ", 0, args.src_tensor.Depth() - 1);\n";
        }
      }
    }
  }
  bool trivial_kernel_size =
      conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
  if (src_def.HasAxis(Axis::DEPTH)) {
    trivial_kernel_size = trivial_kernel_size && conv_params_.z_kernel_is_1;
  }
  if (need_local_mem) {
    c += "  __local " + weights_data_type + " weights_cache[" +
         std::to_string(local_mem_size) + "];\n";
  } else if (conv_params.AreWeightsBuffer() &&
             gpu_info.SupportsPointersInKernels()) {
    c += "  " + weights_global_ptr + " weights_cache;\n";
  } else if (!trivial_kernel_size) {
    c += "  int filter_offset = 0;\n";
  }
  if (conv_params.AreWeightsBuffer()) {
    std::string offset;
    if (conv_params.different_weights_for_height) {
      offset = "(DST_S * args.src_tensor.Height() + DST_Y * " +
               std::to_string(block_size.w) +
               ") * 4 * args.src_tensor.Slices()";
    } else {
      std::string kernel_spatial_offset = "";
      if (!conv_params_.x_kernel_is_1) {
        kernel_spatial_offset += " * args.kernel_size_x";
      }
      if (!conv_params_.y_kernel_is_1) {
        kernel_spatial_offset += " * args.kernel_size_y";
      }
      if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
        kernel_spatial_offset += " * args.kernel_size_z";
      }
      offset = "DST_S * 4 * args.src_tensor.Slices()" + kernel_spatial_offset;
    }
    if (gpu_info.SupportsPointersInKernels()) {
      c += "  " + weights_global_ptr +
           " filters_loc = args.weights.GetPtr() + " + offset + ";\n";
    } else {
      c += "  int filters_offset = " + offset + ";\n";
    }
  }
  if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
    c += "  for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zck = "zck" + std::to_string(z);
      c += "  int " + zck + " = kz * args.dilation_z + zc" +
           std::to_string(z) + ";\n";
      if (!src_def.SupportsZeroClamp(Axis::DEPTH)) {
        c += "  bool in_z" + std::to_string(z) + " = " + zck + " >= 0 && " +
             zck + " < args.src_tensor.Depth();\n";
        if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
          c += "  " + zck + " = clamp(" + zck +
               ", 0, args.src_tensor.Depth() - 1);\n";
        }
      }
    }
  }
  if (!conv_params_.y_kernel_is_1) {
    c += "  for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yck = "yck" + std::to_string(y);
      c += "  int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) +
           ";\n";
      if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) {
        c += "  bool in_y" + std::to_string(y) + " = " + yck + " >= 0 && " +
             yck + " < args.src_tensor.Height();\n";
        if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
          c += "  " + yck + " = clamp(" + yck +
               ", 0, args.src_tensor.Height() - 1);\n";
        }
      }
    }
  }
  if (!conv_params_.x_kernel_is_1) {
    c += "  for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
    for (int x = 0; x < block_size.x; ++x) {
      const std::string xck = "xck" + std::to_string(x);
      c += "  int " + xck + " = kx * args.dilation_x + xc" +
           std::to_string(x) + ";\n";
      if (!src_def.SupportsZeroClamp(Axis::WIDTH)) {
        c += "  bool in_x" + std::to_string(x) + " = " + xck + " >= 0 && " +
             xck + " < args.src_tensor.Width();\n";
        if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
          c += "  " + xck + " = clamp(" + xck +
               ", 0, args.src_tensor.Width() - 1);\n";
        }
      }
    }
  }
  const bool need_multiple_slice_strides =
      src_def.ReturnsZeroForNegOneRead() && !trivial_kernel_size;
  for (int z = 0; z < block_size.z; ++z) {
    const std::string zind = std::to_string(z);
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
        const std::string xind = std::to_string(x);
        std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
        std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
        const std::string id = generate_id(xind, yind, zind);
        std::string coords = xc + ", " + yc;
        if (src_def.HasAxis(Axis::DEPTH)) {
          std::string zc =
              conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
          coords += ", " + zc;
        }
        if (src_def.IsLinear()) {
          c += "  args.src_tensor.GetAddress(addr" + id + ", " + coords +
               ", 0);\n";
          if (need_multiple_slice_strides) {
            const std::string check = generate_check(xind, yind, zind);
            c += "  addr" + id + " = select(-1, addr" + id + ", (" + check +
                 "));\n";
            c += "  int ds" + id +
                 " = select(0, args.src_tensor.SliceStride(), (" + check +
                 "));\n";
          }
        }
      }
    }
  }
  if (src_def.IsLinear() && !need_multiple_slice_strides) {
    c += "  int ds = args.src_tensor.SliceStride();\n";
  }

  auto declare_src = [&]() {
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          const std::string id = generate_id(xind, yind, zind);
          c += "    " + weights_data_type + " src" + id + ";\n";
        }
      }
    }
  };
  const bool conditional_read = gpu_info.IsMali();
  auto read_src = [&]() {
    const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          std::string id = generate_id(xind, yind, zind);
          const std::string check = generate_check(xind, yind, zind);
          std::string address;
          if (src_def.IsLinear()) {
            address = "addr" + id;
          } else {
            std::string xc =
                conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
            std::string yc =
                conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
            address = xc + ", " + yc;
            if (src_def.HasAxis(Axis::DEPTH)) {
              std::string zc =
                  conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
              address += ", " + zc;
            }
            address += ", s";
          }
          if (src_def.ReturnsZeroForNegOneRead()) {
            c += "    src" + id + " = args.src_tensor.Read<" + cl_type + ">(" +
                 address + ");\n";
            const std::string ds = trivial_kernel_size ? "ds" : "ds" + id;
            c += "    " + address + " += " + ds + ";\n";
          } else {
            if (!check.empty()) {
              if (conditional_read) {
                c += "    src" + id + " = " + check +
                     " ? args.src_tensor.Read<" + cl_type + ">(" + address +
                     ") : INIT_FLT4(0.0f);\n";
              } else {
                c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
                     ">(" + address + ") * INIT_FLT(" + check + ");\n";
              }
            } else {
              c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
                   ">(" + address + ");\n";
            }
            if (src_def.IsLinear()) {
              c += "    " + address + " += ds;\n";
            }
          }
        }
      }
    }
  };
  const bool weights_type_as_accum_type =
      !(op_def.precision == CalculationsPrecision::F32_F16 &&
        conv_params.weights_data_type == DataType::FLOAT16);
  auto conv_core = [&](int shared_offset) {
    const std::string channels[] = {"x", "y", "z", "w"};
    for (int s = 0; s < block_size.w; ++s) {
      const std::string sind = std::to_string(s);
      if (weights_type_as_accum_type) {
        for (int ch = 0; ch < 4; ++ch) {
          for (int z = 0; z < block_size.z; ++z) {
            const std::string zind = std::to_string(z);
            for (int y = 0; y < block_size.y; ++y) {
              const std::string yind = std::to_string(y);
              for (int x = 0; x < block_size.x; ++x) {
                const std::string xind = std::to_string(x);
                std::string R = "r" + generate_id_full(xind, yind, zind, sind);
                std::string S = "src" + generate_id(xind, yind, zind);
                if (use_simd_broadcast) {
                  int simd_id = (s * 4 + ch + shared_offset) / simd_size;
                  int thread_id = (s * 4 + ch + shared_offset) % simd_size;
                  std::string w_val_x = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".x, " +
                                        std::to_string(thread_id) + "u)";
                  std::string w_val_y = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".y, " +
                                        std::to_string(thread_id) + "u)";
                  std::string w_val_z = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".z, " +
                                        std::to_string(thread_id) + "u)";
                  std::string w_val_w = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".w, " +
                                        std::to_string(thread_id) + "u)";
                  if (GetWeightsDescription().IsI4O4()) {
                    c += "    " + R + ".x += " + w_val_x + " * " + S + "." +
                         channels[ch] + ";\n";
                    c += "    " + R + ".y += " + w_val_y + " * " + S + "." +
                         channels[ch] + ";\n";
                    c += "    " + R + ".z += " + w_val_z + " * " + S + "." +
                         channels[ch] + ";\n";
                    c += "    " + R + ".w += " + w_val_w + " * " + S + "." +
                         channels[ch] + ";\n";
                  } else {
                    c += "    " + R + "." + channels[ch] + " += " + w_val_x +
                         " * " + S + ".x;\n";
                    c += "    " + R + "." + channels[ch] + " += " + w_val_y +
                         " * " + S + ".y;\n";
                    c += "    " + R + "." + channels[ch] + " += " + w_val_z +
                         " * " + S + ".z;\n";
                    c += "    " + R + "." + channels[ch] + " += " + w_val_w +
                         " * " + S + ".w;\n";
                  }
                } else {
                  const std::string weight_id =
                      std::to_string(s * 4 + ch + shared_offset);
                  std::string w_val;
                  if (conv_params.AreWeightsBuffer()) {
                    if (gpu_info.SupportsPointersInKernels()) {
                      w_val = "weights_cache[" + weight_id + "]";
                    } else {
                      w_val = "args.weights.Read(filters_offset + " +
                              weight_id + ")";
                    }
                  } else {
                    w_val = "f" + weight_id;
                  }
                  if (GetWeightsDescription().IsI4O4()) {
                    c += "    " + R + " += " + w_val + " * " + S + "." +
                         channels[ch] + ";\n";
                  } else {
                    c += "    " + R + "." + channels[ch] + " += dot(" + w_val +
                         ", " + S + ");\n";
                  }
                }
              }
            }
          }
        }
      } else {  // F32_F16 precision and weights type is float16
        for (int z = 0; z < block_size.z; ++z) {
          const std::string zind = std::to_string(z);
          for (int y = 0; y < block_size.y; ++y) {
            const std::string yind = std::to_string(y);
            for (int x = 0; x < block_size.x; ++x) {
              const std::string xind = std::to_string(x);
              std::string R = "r" + generate_id_full(xind, yind, zind, sind);
              std::string S = "src" + generate_id(xind, yind, zind);
              std::vector<std::string> F(4);
              for (int i = 0; i < 4; ++i) {
                std::string weight_id =
                    std::to_string(s * 4 + i + shared_offset);
                if (conv_params.AreWeightsBuffer()) {
                  if (gpu_info.SupportsPointersInKernels()) {
                    F[i] = "weights_cache[" + weight_id + "]";
                  } else {
                    F[i] =
                        "args.weights.Read(filters_offset + " + weight_id + ")";
                  }
                } else {
                  F[i] = "f" + weight_id;
                }
              }
              if (GetWeightsDescription().IsI4O4()) {
                c += "    " + R + " += TO_ACCUM_TYPE(" + S + ".x * " + F[0] +
                     " + " + S + ".y * " + F[1] + " + " + S + ".z * " + F[2] +
                     " + " + S + ".w * " + F[3] + ");\n";
              } else {
                c += "    " + R + ".x += dot(" + S + ", " + F[0] + ");\n";
                c += "    " + R + ".y += dot(" + S + ", " + F[1] + ");\n";
                c += "    " + R + ".z += dot(" + S + ", " + F[2] + ");\n";
                c += "    " + R + ".w += dot(" + S + ", " + F[3] + ");\n";
              }
            }
          }
        }
      }
    }
  };

917 c += " int s = 0;\n";
918 c += " do {\n";
919 declare_src();
920 const int total_work_items =
921 work_group_size_.x * work_group_size_.y * work_group_size_.z;
922 if (conv_params.weights_upload_type ==
923 ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
924 c += GenerateAsyncUpload("weights_cache", "filters_loc",
925 /*global_offset_name*/ "", local_mem_size);
926 } else if (conv_params.weights_upload_type ==
927 ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
928 c += " " + barrier + ";\n";
929 c += GenerateUploadByThreads("weights_cache", "filters_loc",
930 /*global_offset_name*/ "", "lid",
931 total_work_items, local_mem_size);
932 } else if (use_simd_broadcast) {
933 int parts = local_mem_size / simd_size;
934 int reminder = local_mem_size % simd_size;
935 for (int i = 0; i < parts; ++i) {
936 c += " FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " +
937 std::to_string(i * simd_size) + "];\n";
938 }
939 if (reminder) {
940 c += " FLT4 simd_w" + std::to_string(parts) + ";\n";
941 c += " if (simd_id < " + std::to_string(reminder) + ") {\n";
942 c += " simd_w" + std::to_string(parts) +
943 " = filters_loc[simd_id + " + std::to_string(parts * simd_size) +
944 "];\n";
945 c += " }\n";
946 }
947 } else if (conv_params.AreWeightsBuffer()) { // GLOBAL_MEM/CONSTANT_MEM
948 if (gpu_info.SupportsPointersInKernels()) {
949 c += " weights_cache = filters_loc;\n";
950 }
951 } else { // TEXTURES_MEM
952 for (int dst_s = 0; dst_s < block_size.w; ++dst_s) {
953 std::string f_y = trivial_kernel_size ? "s" : "filter_offset";
954 if (conv_params.different_weights_for_height) {
955 f_y = "DST_Y * args.src_tensor.Slices() + s";
956 }
957 c += absl::Substitute(
958 R"( FLT4 f$2 = args.weights0.Read(DST_S + $0, $1);
959 FLT4 f$3 = args.weights1.Read(DST_S + $0, $1);
960 FLT4 f$4 = args.weights2.Read(DST_S + $0, $1);
961 FLT4 f$5 = args.weights3.Read(DST_S + $0, $1);
962 )",
963 dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2,
964 dst_s * 4 + 3);
965 }
966 if (!trivial_kernel_size) {
967 c += " filter_offset++;\n";
968 }
969 }
  read_src();
  c += "    s += 1;\n";
  if (conv_params.weights_upload_type ==
      ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    c += "    " + barrier + ";\n";
  }
  conv_core(0);
  for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
    read_src();
    conv_core(i * block_size.w * 4);
    c += "    s += 1;\n";
  }
  if (conv_params.AreWeightsBuffer()) {
    if (gpu_info.SupportsPointersInKernels()) {
      c += "    filters_loc += " + std::to_string(local_mem_size) + ";\n";
    } else {
      c += "    filters_offset += " + std::to_string(local_mem_size) + ";\n";
    }
  }
  c += "  } while (s < args.src_tensor.Slices());\n";
  if (!conv_params.x_kernel_is_1) {
    c += "  };\n";
  }
  if (!conv_params.y_kernel_is_1) {
    c += "  };\n";
  }
  if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
    c += "  };\n";
  }
  if (conv_params.AreWeightsBuffer()) {
    if (conv_params.weights_upload_type ==
        ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
      c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S",
                               block_size.w);
    } else if (conv_params.weights_upload_type ==
               ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
      c += "  " + barrier + ";\n";
      c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()",
                                   "DST_S", "lid", total_work_items,
                                   block_size.w);
      c += "  " + barrier + ";\n";
    } else if (gpu_info.SupportsPointersInKernels()) {
      c += "  weights_cache = args.biases.GetPtr() + DST_S;\n";
    }
  }
  if (late_oob_check) {
    c += "  if (" + dst_oob_check + ") {\n";
    c += "    return;\n";
    c += "  }\n";
  }

  auto generate_dst_check = [&](int x, int y, int z) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"Width()", "Height()", "Depth()"};
    std::vector<std::string> coords(3);
    coords[0] = "DST_X + " + std::to_string(x);
    coords[1] = "DST_Y + " + std::to_string(y);
    coords[2] = "DST_Z + " + std::to_string(z);
    const std::vector<int> ids{x, y, z};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_def.HasAxis(axis) && ids[i] != 0) {
        if (!check.empty()) {
          check += " && ";
        }
        check += coords[i] + " < args.dst_tensor." + names[i];
      }
    }
    return check;
  };

  for (int s = 0; s < block_size.w; ++s) {
    const std::string sind = std::to_string(s);
    c += "  if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n";
    c += "  {\n";
    if (conv_params.AreWeightsBuffer() &&
        gpu_info.SupportsPointersInKernels()) {
      c += "    FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n";
    } else {
      c += "    FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n";
    }
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          const std::string id = generate_id_full(xind, yind, zind, sind);
          const std::string check = generate_dst_check(x, y, z);
          std::string coords = "DST_X + " + xind + ", DST_Y + " + yind;
          if (src_def.HasAxis(Axis::DEPTH)) {
            coords += ", DST_Z + " + zind;
          }
          coords += ", DST_S + " + sind;
          if (!check.empty()) {
            c += "    if (" + check + ") {\n";
          } else {
            c += "    {\n";
          }
          c += "      FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
          c += "      args.dst_tensor.Write(res, " + coords + ");\n";
          c += "    }\n";
        }
      }
    }
    c += "  }\n";
  }
  c += "}\n";
  return c;
}

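// Heuristically picks block size, work group size/launch order, weights upload
// strategy, and source-depth unrolling for the target GPU vendor; dst_shape,
// when available, is used to estimate occupancy and shrink the block size for
// small workloads.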
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition, int src_depth,
    int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
    bool different_weights_for_height, const BHWC* dst_shape) {
  ConvParams conv_params;
  conv_params.linear_spatial = false;
  conv_params.linear_all = false;
  conv_params.block_size = int4(1, 1, 1, 1);
  conv_params.weights_data_type =
      DeduceDataTypeFromPrecision(definition.precision);
  conv_params.x_kernel_is_1 = x_kernel_is_1;
  conv_params.y_kernel_is_1 = y_kernel_is_1;
  conv_params.different_weights_for_height = different_weights_for_height;
  if (gpu_info.IsNvidia()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(2, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (dst_shape) {
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      float task_size_per_cu =
          static_cast<float>(task_size) / gpu_info.GetComputeUnitsCount();
      int block_size = conv_params.block_size.x * conv_params.block_size.y *
                       conv_params.block_size.w;
      float threads_per_cu = task_size_per_cu / block_size;
      float warps_per_cu = threads_per_cu / 32 /*warp_size*/;
      if (warps_per_cu < 8.0f) {
        conv_params.block_size.x = 1;
      }
      if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4) {
        conv_params.block_size.w /= 2;
      }
      if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2) {
        conv_params.block_size.w /= 2;
      }
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsPowerVR()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.weights_data_type =
        definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16
                                                           : DataType::FLOAT32;
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type =
        WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (definition.precision == CalculationsPrecision::F16) {
      conv_params.block_size.w = std::min(4, conv_params.block_size.w);
      if (src_depth % 2 == 0) {
        conv_params.src_depth_loop_size = 2;
      }
      if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
        conv_params.src_depth_loop_size = 4;
      }
      if (conv_params.block_size.w == 1) {
        if (src_depth % 2 == 0) {
          conv_params.src_depth_loop_size = 2;
        }
        if (src_depth % 4 == 0) {
          conv_params.src_depth_loop_size = 4;
        }
        if (src_depth <= 8) {
          conv_params.src_depth_loop_size = src_depth;
        }
      }
      conv_params.block_size.x = 2;
    }
  } else if (gpu_info.IsAMD()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      work_group_size_ = int3(8, 4, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    }

    conv_params.block_size = int4(2, 1, 1, 1);
    if (x_kernel_is_1 && y_kernel_is_1) {
      conv_params.block_size.y = 2;
    }
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = 1;
    }
    if (src_depth % 2 == 0 && src_depth >= 16) {
      conv_params.src_depth_loop_size = 2;
    }
  } else if (gpu_info.IsMali()) {
    int block_size = 2;
    if (dst_shape) {
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      block_size = GetRecommendedBlockSizeForConv(
          gpu_info, definition.precision, task_size);
    }
    if (!x_kernel_is_1 || !y_kernel_is_1) {
      block_size = std::min(block_size, 4);
    }
    if (block_size == 8) {
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 2, 1, 2);
      }
    } else if (block_size == 4) {
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 1, 1, 1);
        if (definition.precision == CalculationsPrecision::F32 &&
            gpu_info.mali_info.IsValhall()) {
          conv_params.block_size.y = 2;
        } else {
          conv_params.block_size.w = 2;
        }
      }
    } else if (block_size == 2) {
      conv_params.block_size = int4(2, 1, 1, 1);
    } else {
      conv_params.block_size = int4(1, 1, 1, 1);
    }
    conv_params.src_depth_loop_size = 1;
    MaliInfo mali_info = gpu_info.mali_info;
    if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && block_size == 1 && !mali_info.IsMidgard() &&
        definition.precision == CalculationsPrecision::F16) {
      conv_params.src_depth_loop_size = 4;
    }
    work_group_size_ = int3(4, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else if (gpu_info.IsAdreno()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      if (definition.precision == CalculationsPrecision::F16) {
        conv_params.block_size = int4(2, 2, 1, 2);
      } else if (definition.precision == CalculationsPrecision::F32_F16) {
        conv_params.block_size = int4(2, 1, 1, 2);
      } else {  // F32
        conv_params.block_size = int4(2, 2, 1, 1);
      }
    }
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    if (definition.src_tensors.size() == 2) {
      // dynamic weights supported only with buffers.
      conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4;
    }
  } else if (gpu_info.IsIntel()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    int sub_group_size = 16;
    const bool supports_subgroups =
        gpu_info.SupportsExtension("cl_khr_subgroups") ||
        gpu_info.SupportsExtension("cl_intel_subgroups");
    if (definition.precision != CalculationsPrecision::F32_F16 &&
        supports_subgroups &&
        gpu_info.SupportsExtension("cl_intel_required_subgroup_size") &&
        gpu_info.SupportsSubGroupWithSize(sub_group_size)) {
      conv_params.weights_upload_type =
          WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
      conv_params.simd_size = sub_group_size;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsApple()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    work_group_size_ = int3(8, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = true;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else {
    conv_params.block_size = int4(1, 1, 1, 4);
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  }
  if (conv_params.AreWeightsBuffer()) {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout = WeightsLayout::kOSpatialIOGroupO4I4;
    } else {
      conv_params.weights_layout = WeightsLayout::kOSpatialIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout =
          WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4;
    } else {
      conv_params.weights_layout =
          WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4;
    }
  }

  return conv_params;
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                             attr.dilations.w == 1 &&
                             attr.padding.prepended.w == 0 &&
                             attr.padding.appended.w == 0;
  const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
                             attr.dilations.h == 1 &&
                             attr.padding.prepended.h == 0 &&
                             attr.padding.appended.h == 0;
  return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                         x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}

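// 3D variant: the shared heuristic only understands BHWC, so depth is folded
// into height (h * d) for the estimate and z_kernel_is_1 is patched in
// afterwards.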
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution3DAttributes& attr, const BHWDC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                             attr.dilations.w == 1 &&
                             attr.padding.prepended.w == 0 &&
                             attr.padding.appended.w == 0;
  const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
                             attr.dilations.h == 1 &&
                             attr.padding.prepended.h == 0 &&
                             attr.padding.appended.h == 0;
  const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 &&
                             attr.dilations.d == 1 &&
                             attr.padding.prepended.d == 0 &&
                             attr.padding.appended.d == 0;

  ConvPowerVR::ConvParams result;
  BHWC shape;
  if (dst_shape) {
    shape.b = dst_shape->b;
    shape.h = dst_shape->h * dst_shape->d;
    shape.w = dst_shape->w;
    shape.c = dst_shape->c;
    result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                             x_kernel_is_1, y_kernel_is_1, false, &shape);
  } else {
    result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                             x_kernel_is_1, y_kernel_is_1, false, nullptr);
  }
  result.z_kernel_is_1 = z_kernel_is_1;
  return result;
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(weights_shape.b, 4);
  const int src_depth = DivideRoundUp(weights_shape.c, 4);
  const bool x_kernel_is_1 =
      weights_shape.w == 1 && attr.strides.w == 1 && attr.dilations.w == 1 &&
      attr.padding.prepended.w == 0 && attr.padding.appended.w == 0;
  const bool y_kernel_is_1 =
      weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
      attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
  return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                         x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvPowerVR::ConvParams params = GuessBestParams(
      gpu_info, definition, src_depth, dst_depth, true, true, false, dst_shape);
  work_group_size_.x *= work_group_size_.y;
  work_group_size_.y = 1;
  params.block_size.x *= params.block_size.y;
  params.block_size.y = 1;
  return params;
}

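// Winograd 4x4->6x6 variant: runs the shared heuristic as a 1x1 convolution
// with different_weights_for_height, then folds the y block dimension into x.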
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvPowerVR::ConvParams params = GuessBestParams(
      gpu_info, definition, src_depth, dst_depth, true, true, true, dst_shape);
  params.block_size.x *= params.block_size.y;
  params.block_size.y = 1;
  return params;
}

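// Factory functions: pick parameters, generate the kernel source, and upload
// the constant data (weights and/or biases) appropriate to each variant.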
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr,
                              const BHWC* dst_shape) {
  ConvPowerVR result(definition, attr, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadData(attr.weights, attr.bias);
  return result;
}

ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const FullyConnectedAttributes& attr,
                              const BHWC* dst_shape) {
  ConvPowerVR result(definition, attr, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadData(attr.weights, attr.bias);
  return result;
}

ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
                                            const OperationDef& definition,
                                            const Convolution2DAttributes& attr,
                                            const BHWC& weights_shape,
                                            const BHWC* dst_shape) {
  ConvPowerVR result(definition, attr, weights_shape, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadBias(attr.bias);
  return result;
}

ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
                                          const OperationDef& definition,
                                          const Convolution2DAttributes& attr,
                                          const BHWC* dst_shape) {
  ConvPowerVR result(definition);
  result.conv_params_ =
      result.GuessBestParamsWinograd(gpu_info, definition, attr, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadDataForWinograd4x4To6x6(attr.weights);
  return result;
}

ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
                                const OperationDef& definition,
                                const Convolution3DAttributes& attr,
                                const BHWDC* dst_shape) {
  ConvPowerVR result(definition, attr, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadWeights(attr.weights);
  result.UploadBias(attr.bias);
  return result;
}

}  // namespace gpu
}  // namespace tflite