/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.h"

#include <algorithm>
#include <string>
#include <utility>

#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {
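// Emits generated-kernel code that cooperatively uploads 'elements_to_upload'
// vec4 elements from global to local memory: each of 'total_work_items'
// threads copies one element per round, and a guarded tail handles the
// remainder when the element count is not a multiple of the work group size.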
std::string GenerateUploadByThreads(const std::string& local_ptr_name,
                                    const std::string& global_ptr_name,
                                    const std::string& global_offset_name,
                                    const std::string& lid_name,
                                    int total_work_items,
                                    int elements_to_upload) {
  std::string c;
  std::string offset =
      global_offset_name.empty() ? "" : global_offset_name + " + ";
  const int groups = elements_to_upload / total_work_items;
  const int remainder = elements_to_upload % total_work_items;
  for (int i = 0; i < groups; ++i) {
    c += " " + local_ptr_name + "[" + lid_name + " + " +
         std::to_string(total_work_items * i) + "] = " + global_ptr_name +
         "[" + offset + lid_name + " + " +
         std::to_string(total_work_items * i) + "];\n";
  }
  if (remainder != 0) {
    c += " if (" + lid_name + " < " + std::to_string(remainder) + ") {\n";
    c += " " + local_ptr_name + "[" + lid_name + " + " +
         std::to_string(total_work_items * groups) + "] = " + global_ptr_name +
         "[" + offset + lid_name + " + " +
         std::to_string(total_work_items * groups) + "];\n";
    c += " }\n";
  }
  return c;
}

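// Emits the same upload as a single OpenCL async_work_group_copy() call;
// the trailing 0 is the event_t argument (no event to wait on).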
std::string GenerateAsyncUpload(const std::string& local_ptr_name,
                                const std::string& global_ptr_name,
                                const std::string& global_offset_name,
                                int elements_to_upload) {
  std::string c;
  std::string offset =
      global_offset_name.empty() ? "" : " + " + global_offset_name;
  c += " async_work_group_copy(" + local_ptr_name + ", " + global_ptr_name +
       offset + ", " + std::to_string(elements_to_upload) + ", 0);\n";
  return c;
}

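// Emits code that computes the first output coordinates (DST_X/Y/Z and slice
// DST_S) of the block processed by the current thread. launch_remap is the
// inverse permutation of work_group_launch_order, so the right GROUP_ID_* can
// be recovered when work groups are launched in a permuted order.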
std::string GenerateBlockCoords(const int4& block_size,
                                const int3& work_group_launch_order,
                                bool linear_spatial, bool linear_all,
                                bool need_depth) {
  std::string c;
  int3 launch_remap;
  launch_remap[work_group_launch_order.x] = 0;
  launch_remap[work_group_launch_order.y] = 1;
  launch_remap[work_group_launch_order.z] = 2;
  if (linear_all) {
    c += " int linear_id = GLOBAL_ID_0;\n";
    c += " int DST_S = (linear_id / args.task_size_spatial) * " +
         std::to_string(block_size.w) + ";\n";
    c += " int linear_spatial = linear_id % args.task_size_spatial;\n";
    if (need_depth) {
      c += " int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += " linear_spatial = linear_spatial / args.task_size_x;\n";
      c += " int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += " int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += " int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += " int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
  } else if (linear_spatial) {
    if (work_group_launch_order[0] == 0) {
      c += " int linear_spatial = GLOBAL_ID_0;\n";
    } else {
      c += " int linear_spatial = GROUP_ID_" +
           std::to_string(launch_remap[0]) + " * GROUP_SIZE_0 + LOCAL_ID_0;\n";
    }
    if (need_depth) {
      c += " int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
      c += " linear_spatial = linear_spatial / args.task_size_x;\n";
      c += " int DST_Y = (linear_spatial % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
      c += " int DST_Z = (linear_spatial / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
    } else {
      c += " int DST_Y = (linear_spatial / args.task_size_x) * " +
           std::to_string(block_size.y) + ";\n";
      c += " int DST_X = (linear_spatial % args.task_size_x) * " +
           std::to_string(block_size.x) + ";\n";
    }
    if (work_group_launch_order[1] == 1) {
      c += " int DST_S = GLOBAL_ID_1 * " + std::to_string(block_size.w) +
           ";\n";
    } else {
      c += " int DST_S = (GROUP_ID_" + std::to_string(launch_remap[1]) +
           " * GROUP_SIZE_1 + LOCAL_ID_1) * " + std::to_string(block_size.w) +
           ";\n";
    }
  } else {
    if (work_group_launch_order[0] == 0) {
      c += " int DST_X = GLOBAL_ID_0 * " + std::to_string(block_size.x) +
           ";\n";
    } else {
      c += " int DST_X = (GROUP_ID_" + std::to_string(launch_remap[0]) +
           " * GROUP_SIZE_0 + LOCAL_ID_0) * " + std::to_string(block_size.x) +
           ";\n";
    }
    std::string global_id_1;
    if (work_group_launch_order[1] == 1) {
      global_id_1 = "GLOBAL_ID_1";
    } else {
      global_id_1 = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
                    " * GROUP_SIZE_1 + LOCAL_ID_1)";
    }
    if (need_depth) {
      c += " int linear_id_1 = " + global_id_1 + ";\n";
      c += " int DST_Z = (linear_id_1 / args.task_size_y) * " +
           std::to_string(block_size.z) + ";\n";
      c += " int DST_Y = (linear_id_1 % args.task_size_y) * " +
           std::to_string(block_size.y) + ";\n";
    } else {
      c += " int DST_Y = " + global_id_1 + " * " +
           std::to_string(block_size.y) + ";\n";
    }
    if (work_group_launch_order[2] == 2) {
      c += " int DST_S = GLOBAL_ID_2 * " + std::to_string(block_size.w) +
           ";\n";
    } else {
      c += " int DST_S = (GROUP_ID_" + std::to_string(launch_remap[2]) +
           " * GROUP_SIZE_2 + LOCAL_ID_2) * " + std::to_string(block_size.w) +
           ";\n";
    }
  }

  return c;
}
}  // namespace

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const BHWC& weights_shape, const GpuInfo& gpu_info,
                         const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, 1, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
      kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, weights_shape,
                                   dst_shape)) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const FullyConnectedAttributes& attr,
                         const GpuInfo& gpu_info, const BHWC* dst_shape)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition)
    : GPUOperation(definition),
      stride_(1, 1, 1, 1),
      padding_(0, 0, 0, 0),
      kernel_size_(1, 1, 1, 1),
      dilation_(1, 1, 1, 1) {}

ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
    : GPUOperation(std::move(operation)),
      stride_(operation.stride_),
      padding_(operation.padding_),
      kernel_size_(operation.kernel_size_),
      dilation_(operation.dilation_),
      conv_params_(operation.conv_params_) {}

ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution3DAttributes& attr,
                         const GpuInfo& gpu_info, const BHWDC* dst_shape)
    : GPUOperation(definition),
      stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
               -attr.padding.prepended.d, 0),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
                   attr.weights.shape.d, 1),
      dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1),
      conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}

ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
  if (this != &operation) {
    std::swap(stride_, operation.stride_);
    std::swap(padding_, operation.padding_);
    std::swap(kernel_size_, operation.kernel_size_);
    std::swap(dilation_, operation.dilation_);
    std::swap(conv_params_, operation.conv_params_);
    GPUOperation::operator=(std::move(operation));
  }
  return *this;
}

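// Builds the kernel source and records compiler options that depend on the
// target GPU: a flattened grid when spatial axes are linearized, PowerVR FP16
// math, OpenCL 2.0 for sub-group broadcast, and full-SIMD mode on Adreno3xx
// when the kernel is 1x1(x1).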
void ConvPowerVR::GenerateCode(const GpuInfo& gpu_info) {
  if (conv_params_.linear_all) {
    grid_dimension_ = 1;
  } else if (conv_params_.linear_spatial) {
    grid_dimension_ = 2;
  }
  const bool stride_correction =
      definition_.IsBatchSupported() && stride_.x != 1;
  code_ = GenerateConv(gpu_info, definition_, stride_correction, conv_params_);
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClPowervrFp16);
  }
  if (conv_params_.IsPrivateMemBroadcast() && gpu_info.IsCL20OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl20);
  }
  bool kernel_is_trivial =
      conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
    kernel_is_trivial = kernel_is_trivial && conv_params_.z_kernel_is_1;
  }
  if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx() &&
      definition_.precision == CalculationsPrecision::F16 &&
      kernel_is_trivial) {
    compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
  }
}

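// Binds the per-dispatch integer arguments. padding_x and dilation_x are
// premultiplied by the batch size because in batched mode the W axis of the
// source tensor stores width * batch (see the "BatchedWidth" state variable
// set in GenerateConv).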
absl::Status ConvPowerVR::BindArguments(ArgumentsBinder* args) {
  if (!conv_params_.x_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_x", stride_.x));
    RETURN_IF_ERROR(args->SetInt("padding_x", padding_.x * src_[0]->Batch()));
    RETURN_IF_ERROR(args->SetInt("kernel_size_x", kernel_size_.x));
    RETURN_IF_ERROR(args->SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
  }
  if (!conv_params_.y_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_y", stride_.y));
    RETURN_IF_ERROR(args->SetInt("padding_y", padding_.y));
    RETURN_IF_ERROR(args->SetInt("kernel_size_y", kernel_size_.y));
    RETURN_IF_ERROR(args->SetInt("dilation_y", dilation_.y));
  }
  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH) &&
      !conv_params_.z_kernel_is_1) {
    RETURN_IF_ERROR(args->SetInt("stride_z", stride_.z));
    RETURN_IF_ERROR(args->SetInt("padding_z", padding_.z));
    RETURN_IF_ERROR(args->SetInt("kernel_size_z", kernel_size_.z));
    RETURN_IF_ERROR(args->SetInt("dilation_z", dilation_.z));
  }
  const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                        conv_params_.block_size.x);
  const int task_size_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int task_size_z =
      DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
  RETURN_IF_ERROR(args->SetInt("task_size_x", task_size_x));
  RETURN_IF_ERROR(args->SetInt("task_size_y", task_size_y));
  const int task_size_spatial = task_size_x * task_size_y * task_size_z;
  RETURN_IF_ERROR(args->SetInt("task_size_spatial", task_size_spatial));
  return absl::OkStatus();
}

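// The grid is the per-axis output size divided by the block size; spatial
// axes (and, for linear_all, the slice axis too) are folded into one grid
// dimension when the kernel was generated with linearized coordinates.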
int3 ConvPowerVR::GetGridSize() const {
  const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                        conv_params_.block_size.x);
  const int task_size_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int task_size_z =
      DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
  const int task_size_s =
      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w);
  if (conv_params_.linear_all) {
    return int3(task_size_x * task_size_y * task_size_z * task_size_s, 1, 1);
  } else if (conv_params_.linear_spatial) {
    return int3(task_size_x * task_size_y * task_size_z, task_size_s, 1);
  } else {
    return int3(task_size_x, task_size_y * task_size_z, task_size_s);
  }
}

void ConvPowerVR::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  if (conv_params_.weights_upload_type ==
          WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
      conv_params_.weights_upload_type ==
          WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      conv_params_.fixed_work_group_size) {
    work_groups->push_back(work_group_size_);
    return;
  }
  GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
}

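// Generates the convolution kernel source. The emitted kernel computes the
// block coordinates, initializes one accumulator per output element of the
// block, loops over the kernel spatial taps (unless the kernel is trivial)
// and over source slices, loads weights according to weights_upload_type,
// multiply-accumulates, then adds bias and writes the results with
// out-of-bounds checks.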
std::string ConvPowerVR::GenerateConv(const GpuInfo& gpu_info,
                                      const OperationDef& op_def,
                                      bool stride_correction,
                                      const ConvParams& conv_params) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  AddSrcTensor("src_tensor", src_desc);
  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].data_type;
    desc.element_size = 4;
    desc.memory_type = conv_params.weights_upload_type ==
                               ConvPowerVR::WeightsUploadType::CONSTANT_MEM
                           ? MemoryType::CONSTANT
                           : MemoryType::GLOBAL;

    AddSrcBuffer("weights", desc);
  }

  const auto& src_def = op_def.src_tensors[0];

  auto generate_id = [&](const std::string& x, const std::string& y,
                         const std::string& z) {
    std::string id;
    if (src_def.HasAxis(Axis::WIDTH)) {
      id += "_w" + x;
    }
    if (src_def.HasAxis(Axis::HEIGHT)) {
      id += "_h" + y;
    }
    if (src_def.HasAxis(Axis::DEPTH)) {
      id += "_d" + z;
    }
    return id;
  };

  auto generate_id_full = [&](const std::string& x, const std::string& y,
                              const std::string& z, const std::string& s) {
    return generate_id(x, y, z) + "_s" + s;
  };

  auto generate_check = [&](const std::string& x, const std::string& y,
                            const std::string& z) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"in_x", "in_y", "in_z"};
    const std::vector<bool> is_1{conv_params_.x_kernel_is_1,
                                 conv_params_.y_kernel_is_1,
                                 conv_params_.z_kernel_is_1};
    const std::vector<std::string> coords{x, y, z};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) &&
          !is_1[i]) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i] + coords[i];
      }
    }
    return check;
  };

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  AddDstTensor("dst_tensor", dst_desc);

  if (!conv_params_.x_kernel_is_1) {
    args_.AddInt("stride_x");
    args_.AddInt("padding_x");
    args_.AddInt("kernel_size_x");
    args_.AddInt("dilation_x");
  }
  if (!conv_params_.y_kernel_is_1) {
    args_.AddInt("stride_y");
    args_.AddInt("padding_y");
    args_.AddInt("kernel_size_y");
    args_.AddInt("dilation_y");
  }
  if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
    args_.AddInt("stride_z");
    args_.AddInt("padding_z");
    args_.AddInt("kernel_size_z");
    args_.AddInt("dilation_z");
  }
  args_.AddInt("task_size_x");
  args_.AddInt("task_size_y");
  args_.AddInt("task_size_spatial");

  const int wg_total_size =
      work_group_size_.x * work_group_size_.y * work_group_size_.z;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";

  const bool need_local_mem =
      conv_params.weights_upload_type ==
          ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      conv_params.weights_upload_type ==
          ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;

  const int local_mem_size =
      conv_params.block_size.w * 4 * conv_params.src_depth_loop_size;

  const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast();
  const int simd_size = conv_params.simd_size;

  const bool late_oob_check = need_local_mem || use_simd_broadcast;

  const std::string weights_space =
      conv_params.weights_upload_type ==
              ConvPowerVR::WeightsUploadType::CONSTANT_MEM
          ? "__constant"
          : "__global";

  const std::string weights_data_type =
      conv_params.weights_data_type == DataType::FLOAT32 ? "float4" : "half4";

  const std::string weights_global_ptr =
      weights_space + " " + weights_data_type + "*";

  std::string c;
  if (use_simd_broadcast && gpu_info.IsApiOpenCl()) {
    if (gpu_info.opencl_info.cl_version == OpenClVersion::kCl2_0) {
      c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
    } else if (gpu_info.SupportsExtension("cl_intel_subgroups")) {
      c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n";
    }
  }
  const int4 block_size = conv_params.block_size;
  if (conv_params.fixed_work_group_size && gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(" +
         std::to_string(work_group_size_.x) + ", " +
         std::to_string(work_group_size_.y) + ", " +
         std::to_string(work_group_size_.z) + ")))\n";
  }
  if (use_simd_broadcast && gpu_info.IsIntel() && gpu_info.IsApiOpenCl()) {
    c += "__attribute__((intel_reqd_sub_group_size(" +
         std::to_string(simd_size) + ")))\n";
  }
  std::string dst_oob_check;
  if (src_def.HasAxis(Axis::DEPTH)) {
    if (conv_params.linear_all) {
      dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
    } else if (conv_params.linear_spatial) {
      dst_oob_check =
          "DST_Z >= args.dst_tensor.Depth() || DST_S >= "
          "args.dst_tensor.Slices()";
    } else {
      dst_oob_check =
          "DST_X >= args.dst_tensor.Width() || DST_Z >= "
          "args.dst_tensor.Depth() || DST_S >= args.dst_tensor.Slices()";
    }
  } else {
    if (conv_params.linear_all) {
      dst_oob_check = "DST_S >= args.dst_tensor.Slices()";
    } else if (conv_params.linear_spatial) {
      dst_oob_check =
          "DST_Y >= args.dst_tensor.Height() || DST_S >= "
          "args.dst_tensor.Slices()";
    } else {
      dst_oob_check =
          "DST_X >= args.dst_tensor.Width() || DST_Y >= "
          "args.dst_tensor.Height() || DST_S >= args.dst_tensor.Slices()";
    }
  }
  c += "MAIN_FUNCTION($0) {\n";
  c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
                           conv_params.linear_spatial, conv_params.linear_all,
                           src_def.HasAxis(Axis::DEPTH));
  if (!late_oob_check) {
    c += " if (" + dst_oob_check + ") {\n";
    c += " return;\n";
    c += " }\n";
  }
  if (conv_params.weights_upload_type ==
      ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    if (conv_params.linear_spatial) {
      c += " int lid = LOCAL_ID_0;\n";
    } else {
      c += " int lid = LOCAL_ID_1 * " + std::to_string(work_group_size_.x) +
           " + LOCAL_ID_0;\n";
    }
  }
  if (use_simd_broadcast) {
    c += " int simd_id = SUB_GROUP_LOCAL_ID;\n";
  }
  for (int s = 0; s < block_size.w; ++s) {
    const std::string sind = std::to_string(s);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          c += " ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
               " = INIT_ACCUM_FLT4(0.0f);\n";
        }
      }
    }
  }
  if (!conv_params_.x_kernel_is_1) {
    for (int x = 0; x < block_size.x; ++x) {
      const std::string xind = std::to_string(x);
      const std::string xc = "(DST_X + " + xind + ")";
      if (stride_correction) {
        c += " int xc" + xind + " = " +
             GetXStrideCorrected(xc, "args.src_tensor.Batch()",
                                 "args.stride_x", "args.padding_x") +
             ";\n";
      } else {
        c += " int xc" + xind + " = " + xc +
             " * args.stride_x + args.padding_x;\n";
      }
    }
  } else {
    for (int x = 0; x < block_size.x; ++x) {
      const std::string xind = std::to_string(x);
      c += " int xc" + xind + " = DST_X + " + xind + ";\n";
      if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
        c += " xc" + xind + " = clamp(xc" + xind +
             ", 0, args.src_tensor.Width() - 1);\n";
      }
    }
  }
  if (!conv_params_.y_kernel_is_1) {
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      const std::string yc = "(DST_Y + " + yind + ")";
      c += " int yc" + yind + " = " + yc +
           " * args.stride_y + args.padding_y;\n";
    }
  } else {
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      c += " int yc" + yind + " = DST_Y + " + yind + ";\n";
      if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
        c += " yc" + yind + " = clamp(yc" + yind +
             ", 0, args.src_tensor.Height() - 1);\n";
      }
    }
  }
  if (src_def.HasAxis(Axis::DEPTH)) {
    if (!conv_params_.z_kernel_is_1) {
      for (int z = 0; z < block_size.z; ++z) {
        const std::string zind = std::to_string(z);
        const std::string zc = "(DST_Z + " + zind + ")";
        c += " int zc" + zind + " = " + zc +
             " * args.stride_z + args.padding_z;\n";
      }
    } else {
      for (int z = 0; z < block_size.z; ++z) {
        const std::string zind = std::to_string(z);
        c += " int zc" + zind + " = DST_Z + " + zind + ";\n";
        if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
          c += " zc" + zind + " = clamp(zc" + zind +
               ", 0, args.src_tensor.Depth() - 1);\n";
        }
      }
    }
  }
  bool trivial_kernel_size =
      conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
  if (src_def.HasAxis(Axis::DEPTH)) {
    trivial_kernel_size = trivial_kernel_size && conv_params_.z_kernel_is_1;
  }
  if (need_local_mem) {
    c += " __local " + weights_data_type + " weights_cache[" +
         std::to_string(local_mem_size) + "];\n";
  } else if (conv_params.AreWeightsBuffer()) {
    c += " " + weights_global_ptr + " weights_cache;\n";
  } else if (!trivial_kernel_size) {
    c += " int filter_offset = 0;\n";
  }
  if (conv_params.AreWeightsBuffer()) {
    if (conv_params.different_weights_for_height) {
      c += " " + weights_global_ptr +
           " filters_loc = args.weights.GetPtr() + (DST_S * "
           "args.src_tensor.Height() + DST_Y * " +
           std::to_string(block_size.w) +
           ") * 4 * args.src_tensor.Slices();\n";
    } else {
      std::string kernel_spatial_offset = "";
      if (!conv_params_.x_kernel_is_1) {
        kernel_spatial_offset += " * args.kernel_size_x";
      }
      if (!conv_params_.y_kernel_is_1) {
        kernel_spatial_offset += " * args.kernel_size_y";
      }
      if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
        kernel_spatial_offset += " * args.kernel_size_z";
      }
      c += " " + weights_global_ptr +
           " filters_loc = args.weights.GetPtr() + DST_S * 4 * "
           "args.src_tensor.Slices()" +
           kernel_spatial_offset + ";\n";
    }
  }
  if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
    c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zck = "zck" + std::to_string(z);
      c += " int " + zck + " = kz * args.dilation_z + zc" + std::to_string(z) +
           ";\n";
      if (!src_def.SupportsZeroClamp(Axis::DEPTH)) {
        c += " bool in_z" + std::to_string(z) + " = " + zck + " >= 0 && " +
             zck + " < args.src_tensor.Depth();\n";
        if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
          c += " " + zck + " = clamp(" + zck +
               ", 0, args.src_tensor.Depth() - 1);\n";
        }
      }
    }
  }
  if (!conv_params_.y_kernel_is_1) {
    c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yck = "yck" + std::to_string(y);
      c += " int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) +
           ";\n";
      if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) {
        c += " bool in_y" + std::to_string(y) + " = " + yck + " >= 0 && " +
             yck + " < args.src_tensor.Height();\n";
        if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
          c += " " + yck + " = clamp(" + yck +
               ", 0, args.src_tensor.Height() - 1);\n";
        }
      }
    }
  }
  if (!conv_params_.x_kernel_is_1) {
    c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
    for (int x = 0; x < block_size.x; ++x) {
      const std::string xck = "xck" + std::to_string(x);
      c += " int " + xck + " = kx * args.dilation_x + xc" + std::to_string(x) +
           ";\n";
      if (!src_def.SupportsZeroClamp(Axis::WIDTH)) {
        c += " bool in_x" + std::to_string(x) + " = " + xck + " >= 0 && " +
             xck + " < args.src_tensor.Width();\n";
        if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
          c += " " + xck + " = clamp(" + xck +
               ", 0, args.src_tensor.Width() - 1);\n";
        }
      }
    }
  }
  const bool need_multiple_slice_strides =
      src_def.ReturnsZeroForNegOneRead() && !trivial_kernel_size;
  for (int z = 0; z < block_size.z; ++z) {
    const std::string zind = std::to_string(z);
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
        const std::string xind = std::to_string(x);
        std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
        std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
        const std::string id = generate_id(xind, yind, zind);
        std::string coords = xc + ", " + yc;
        if (src_def.HasAxis(Axis::DEPTH)) {
          std::string zc =
              conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
          coords += ", " + zc;
        }
        if (src_def.IsLinear()) {
          c += " args.src_tensor.GetAddress(addr" + id + ", " + coords +
               ", 0);\n";
          if (need_multiple_slice_strides) {
            const std::string check = generate_check(xind, yind, zind);
            c += " addr" + id + " = select(-1, addr" + id + ", (" + check +
                 "));\n";
            c += " int ds" + id +
                 " = select(0, args.src_tensor.SliceStride(), (" + check +
                 "));\n";
          }
        }
      }
    }
  }
  if (src_def.IsLinear() && !need_multiple_slice_strides) {
    c += " int ds = args.src_tensor.SliceStride();\n";
  }

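  // declare_src emits one source register per spatial position of the block;
  // read_src emits the loads that fill them, handling borders by address
  // select, conditional read (Mali), or multiply-by-predicate depending on
  // the tensor storage type.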
  auto declare_src = [&]() {
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          const std::string id = generate_id(xind, yind, zind);
          c += " " + weights_data_type + " src" + id + ";\n";
        }
      }
    }
  };
  const bool conditional_read = gpu_info.IsMali();
  auto read_src = [&]() {
    const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          std::string id = generate_id(xind, yind, zind);
          const std::string check = generate_check(xind, yind, zind);
          std::string address;
          if (src_def.IsLinear()) {
            address = "addr" + id;
          } else {
            std::string xc =
                conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
            std::string yc =
                conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
            address = xc + ", " + yc;
            if (src_def.HasAxis(Axis::DEPTH)) {
              std::string zc =
                  conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
              address += ", " + zc;
            }
            address += ", s";
          }
          if (src_def.ReturnsZeroForNegOneRead()) {
            c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" +
                 address + ");\n";
            const std::string ds = trivial_kernel_size ? "ds" : "ds" + id;
            c += " " + address + " += " + ds + ";\n";
          } else {
            if (!check.empty()) {
              if (conditional_read) {
                c += " src" + id + " = " + check +
                     " ? args.src_tensor.Read<" + cl_type + ">(" + address +
                     ") : INIT_FLT4(0.0f);\n";
              } else {
                c += " src" + id + " = args.src_tensor.Read<" + cl_type +
                     ">(" + address + ") * INIT_FLT(" + check + ");\n";
              }
            } else {
              c += " src" + id + " = args.src_tensor.Read<" + cl_type +
                   ">(" + address + ");\n";
            }
            if (src_def.IsLinear()) {
              c += " " + address + " += ds;\n";
            }
          }
        }
      }
    }
  };
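  // With F32_F16 precision and FLOAT16 weights, products are computed in FP16
  // and only the final per-tap sum is converted to the FP32 accumulator;
  // otherwise the weights type matches the accumulator type.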
  const bool weights_type_as_accum_type =
      !(op_def.precision == CalculationsPrecision::F32_F16 &&
        conv_params.weights_data_type == DataType::FLOAT16);
  auto conv_core = [&](int shared_offset) {
    const std::string channels[] = {"x", "y", "z", "w"};
    for (int s = 0; s < block_size.w; ++s) {
      const std::string sind = std::to_string(s);
      if (weights_type_as_accum_type) {
        for (int ch = 0; ch < 4; ++ch) {
          for (int z = 0; z < block_size.z; ++z) {
            const std::string zind = std::to_string(z);
            for (int y = 0; y < block_size.y; ++y) {
              const std::string yind = std::to_string(y);
              for (int x = 0; x < block_size.x; ++x) {
                const std::string xind = std::to_string(x);
                std::string R = "r" + generate_id_full(xind, yind, zind, sind);
                std::string S = "src" + generate_id(xind, yind, zind);
                if (use_simd_broadcast) {
                  int simd_id = (s * 4 + ch + shared_offset) / simd_size;
                  int thread_id = (s * 4 + ch + shared_offset) % simd_size;
                  std::string w_val_x = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".x, " +
                                        std::to_string(thread_id) + "u)";
                  std::string w_val_y = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".y, " +
                                        std::to_string(thread_id) + "u)";
                  std::string w_val_z = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".z, " +
                                        std::to_string(thread_id) + "u)";
                  std::string w_val_w = "SUB_GROUP_BROADCAST(simd_w" +
                                        std::to_string(simd_id) + ".w, " +
                                        std::to_string(thread_id) + "u)";
                  if (GetWeightsDescription().IsI4O4()) {
                    c += " " + R + ".x += " + w_val_x + " * " + S + "." +
                         channels[ch] + ";\n";
                    c += " " + R + ".y += " + w_val_y + " * " + S + "." +
                         channels[ch] + ";\n";
                    c += " " + R + ".z += " + w_val_z + " * " + S + "." +
                         channels[ch] + ";\n";
                    c += " " + R + ".w += " + w_val_w + " * " + S + "." +
                         channels[ch] + ";\n";
                  } else {
                    c += " " + R + "." + channels[ch] + " += " + w_val_x +
                         " * " + S + ".x;\n";
                    c += " " + R + "." + channels[ch] + " += " + w_val_y +
                         " * " + S + ".y;\n";
                    c += " " + R + "." + channels[ch] + " += " + w_val_z +
                         " * " + S + ".z;\n";
                    c += " " + R + "." + channels[ch] + " += " + w_val_w +
                         " * " + S + ".w;\n";
                  }
                } else {
                  const std::string weight_id =
                      std::to_string(s * 4 + ch + shared_offset);
                  std::string w_val;
                  if (conv_params.AreWeightsBuffer()) {
                    w_val = "weights_cache[" + weight_id + "]";
                  } else {
                    w_val = "f" + weight_id;
                  }
                  if (GetWeightsDescription().IsI4O4()) {
                    c += " " + R + " += " + w_val + " * " + S + "." +
                         channels[ch] + ";\n";
                  } else {
                    c += " " + R + "." + channels[ch] + " += dot(" + w_val +
                         ", " + S + ");\n";
                  }
                }
              }
            }
          }
        }
      } else {  // F32_F16 precision and weights type is float16
        for (int z = 0; z < block_size.z; ++z) {
          const std::string zind = std::to_string(z);
          for (int y = 0; y < block_size.y; ++y) {
            const std::string yind = std::to_string(y);
            for (int x = 0; x < block_size.x; ++x) {
              const std::string xind = std::to_string(x);
              std::string R = "r" + generate_id_full(xind, yind, zind, sind);
              std::string S = "src" + generate_id(xind, yind, zind);
              std::vector<std::string> F(4);
              for (int i = 0; i < 4; ++i) {
                std::string weight_id =
                    std::to_string(s * 4 + i + shared_offset);
                if (conv_params.AreWeightsBuffer()) {
                  F[i] = "weights_cache[" + weight_id + "]";
                } else {
                  F[i] = "f" + weight_id;
                }
              }
              if (GetWeightsDescription().IsI4O4()) {
                c += " " + R + " += TO_ACCUM_TYPE(" + S + ".x * " + F[0] +
                     " + " + S + ".y * " + F[1] + " + " + S + ".z * " + F[2] +
                     " + " + S + ".w * " + F[3] + ");\n";
              } else {
                c += " " + R + ".x += dot(" + S + ", " + F[0] + ");\n";
                c += " " + R + ".y += dot(" + S + ", " + F[1] + ");\n";
                c += " " + R + ".z += dot(" + S + ", " + F[2] + ");\n";
                c += " " + R + ".w += dot(" + S + ", " + F[3] + ");\n";
              }
            }
          }
        }
      }
    }
  };

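  // Emit the main loop over source slices: upload or locate the weights for
  // this iteration, read the source block, multiply-accumulate, and unroll
  // src_depth_loop_size slices per pass.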
900 c += " int s = 0;\n";
901 c += " do {\n";
902 declare_src();
903 const int total_work_items =
904 work_group_size_.x * work_group_size_.y * work_group_size_.z;
905 if (conv_params.weights_upload_type ==
906 ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
907 c += GenerateAsyncUpload("weights_cache", "filters_loc",
908 /*global_offset_name*/ "", local_mem_size);
909 } else if (conv_params.weights_upload_type ==
910 ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
911 c += " " + barrier + ";\n";
912 c += GenerateUploadByThreads("weights_cache", "filters_loc",
913 /*global_offset_name*/ "", "lid",
914 total_work_items, local_mem_size);
915 } else if (use_simd_broadcast) {
916 int parts = local_mem_size / simd_size;
917 int reminder = local_mem_size % simd_size;
918 for (int i = 0; i < parts; ++i) {
919 c += " FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " +
920 std::to_string(i * simd_size) + "];\n";
921 }
922 if (reminder) {
923 c += " FLT4 simd_w" + std::to_string(parts) + ";\n";
924 c += " if (simd_id < " + std::to_string(reminder) + ") {\n";
925 c += " simd_w" + std::to_string(parts) +
926 " = filters_loc[simd_id + " + std::to_string(parts * simd_size) +
927 "];\n";
928 c += " }\n";
929 }
930 } else if (conv_params.AreWeightsBuffer()) { // GLOBAL_MEM/CONSTANT_MEM
931 c += " weights_cache = filters_loc;\n";
932 } else { // TEXTURES_MEM
933 for (int dst_s = 0; dst_s < block_size.w; ++dst_s) {
934 std::string f_y = trivial_kernel_size ? "s" : "filter_offset";
935 if (conv_params.different_weights_for_height) {
936 f_y = "DST_Y * args.src_tensor.Slices() + s";
937 }
938 c += absl::Substitute(
939 R"( FLT4 f$2 = args.weights0.Read(DST_S + $0, $1);
940 FLT4 f$3 = args.weights1.Read(DST_S + $0, $1);
941 FLT4 f$4 = args.weights2.Read(DST_S + $0, $1);
942 FLT4 f$5 = args.weights3.Read(DST_S + $0, $1);
943 )",
944 dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2,
945 dst_s * 4 + 3);
946 }
947 if (!trivial_kernel_size) {
948 c += " filter_offset++;\n";
949 }
950 }
951 read_src();
952 c += " s += 1;\n";
953 if (conv_params.weights_upload_type ==
954 ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
955 c += " " + barrier + ";\n";
956 }
957 conv_core(0);
958 for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
959 read_src();
960 conv_core(i * block_size.w * 4);
961 c += " s += 1;\n";
962 }
963 if (conv_params.AreWeightsBuffer()) {
964 c += " filters_loc += " + std::to_string(local_mem_size) + ";\n";
965 }
966 c += " } while (s < args.src_tensor.Slices());\n";
967 if (!conv_params.x_kernel_is_1) {
968 c += " };\n";
969 }
970 if (!conv_params.y_kernel_is_1) {
971 c += " };\n";
972 }
973 if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
974 c += " };\n";
975 }
976 if (conv_params.AreWeightsBuffer()) {
977 if (conv_params.weights_upload_type ==
978 ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
979 c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S",
980 block_size.w);
981 } else if (conv_params.weights_upload_type ==
982 ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
983 c += " " + barrier + ";\n";
984 c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()",
985 "DST_S", "lid", total_work_items,
986 block_size.w);
987 c += " " + barrier + ";\n";
988 } else {
989 c += " weights_cache = args.biases.GetPtr() + DST_S;\n";
990 }
991 }
992 if (late_oob_check) {
993 c += " if (" + dst_oob_check + ") {\n";
994 c += " return;\n";
995 c += " }\n";
996 }
997
998 auto generate_dst_check = [&](int x, int y, int z) {
999 std::string check;
1000 const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
1001 const std::vector<std::string> names{"Width()", "Height()", "Depth()"};
1002 std::vector<std::string> coords(3);
1003 coords[0] = "DST_X + " + std::to_string(x);
1004 coords[1] = "DST_Y + " + std::to_string(y);
1005 coords[2] = "DST_Z + " + std::to_string(z);
1006 const std::vector<int> ids{x, y, z};
1007 for (int i = 0; i < axes.size(); ++i) {
1008 const auto& axis = axes[i];
1009 if (src_def.HasAxis(axis) && ids[i] != 0) {
1010 if (!check.empty()) {
1011 check += " && ";
1012 }
1013 check += coords[i] + " < args.dst_tensor." + names[i];
1014 }
1015 }
1016 return check;
1017 };
1018
1019 for (int s = 0; s < block_size.w; ++s) {
1020 const std::string sind = std::to_string(s);
1021 c += " if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n";
1022 c += " {\n";
1023 if (conv_params.AreWeightsBuffer()) {
1024 c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n";
1025 } else {
1026 c += " FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n";
1027 }
1028 for (int z = 0; z < block_size.z; ++z) {
1029 const std::string zind = std::to_string(z);
1030 for (int y = 0; y < block_size.y; ++y) {
1031 const std::string yind = std::to_string(y);
1032 for (int x = 0; x < block_size.x; ++x) {
1033 const std::string xind = std::to_string(x);
1034 const std::string id = generate_id_full(xind, yind, zind, sind);
1035 const std::string check = generate_dst_check(x, y, z);
1036 std::string coords = "DST_X + " + xind + ", DST_Y + " + yind;
1037 if (src_def.HasAxis(Axis::DEPTH)) {
1038 coords += ", DST_Z + " + zind;
1039 }
1040 coords += ", DST_S + " + sind;
1041 if (!check.empty()) {
1042 c += " if (" + check + ") {\n";
1043 } else {
1044 c += " {\n";
1045 }
1046 c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
1047 c += " args.dst_tensor.Write(res, " + coords + ");\n";
1048 c += " }\n";
1049 }
1050 }
1051 }
1052 c += " }\n";
1053 }
1054 c += "}\n";
1055 return c;
1056 }
1057
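// Heuristically picks block size, work group size and launch order, weights
// upload type, and source-depth unrolling for the target GPU vendor.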
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition, int src_depth,
    int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
    bool different_weights_for_height, const BHWC* dst_shape) {
  ConvParams conv_params;
  conv_params.linear_spatial = false;
  conv_params.linear_all = false;
  conv_params.block_size = int4(1, 1, 1, 1);
  conv_params.weights_data_type =
      DeduceDataTypeFromPrecision(definition.precision);
  conv_params.x_kernel_is_1 = x_kernel_is_1;
  conv_params.y_kernel_is_1 = y_kernel_is_1;
  conv_params.different_weights_for_height = different_weights_for_height;
  if (gpu_info.IsNvidia()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(2, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (dst_shape) {
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      float task_size_per_cu =
          static_cast<float>(task_size) / gpu_info.GetComputeUnitsCount();
      int block_size = conv_params.block_size.x * conv_params.block_size.y *
                       conv_params.block_size.w;
      float threads_per_cu = task_size_per_cu / block_size;
      float warps_per_cu = threads_per_cu / 32 /*warp_size*/;
      if (warps_per_cu < 8.0f) {
        conv_params.block_size.x = 1;
      }
      if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4) {
        conv_params.block_size.w /= 2;
      }
      if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2) {
        conv_params.block_size.w /= 2;
      }
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsPowerVR()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.weights_data_type =
        definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16
                                                           : DataType::FLOAT32;
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type =
        WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (definition.precision == CalculationsPrecision::F16) {
      conv_params.block_size.w = std::min(4, conv_params.block_size.w);
      if (src_depth % 2 == 0) {
        conv_params.src_depth_loop_size = 2;
      }
      if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
        conv_params.src_depth_loop_size = 4;
      }
      if (conv_params.block_size.w == 1) {
        if (src_depth % 2 == 0) {
          conv_params.src_depth_loop_size = 2;
        }
        if (src_depth % 4 == 0) {
          conv_params.src_depth_loop_size = 4;
        }
        if (src_depth <= 8) {
          conv_params.src_depth_loop_size = src_depth;
        }
      }
      conv_params.block_size.x = 2;
    }
  } else if (gpu_info.IsAMD()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      work_group_size_ = int3(8, 4, 1);
      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    }

    conv_params.block_size = int4(2, 1, 1, 1);
    if (x_kernel_is_1 && y_kernel_is_1) {
      conv_params.block_size.y = 2;
    }
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = 1;
    }
    if (src_depth % 2 == 0 && src_depth >= 16) {
      conv_params.src_depth_loop_size = 2;
    }
  } else if (gpu_info.IsMali()) {
    int block_size = 2;
    if (dst_shape) {
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      block_size = GetRecommendedBlockSizeForConv(
          gpu_info, definition.precision, task_size);
    }
    if (!x_kernel_is_1 || !y_kernel_is_1) {
      block_size = std::min(block_size, 4);
    }
    if (block_size == 8) {
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 2, 1, 2);
      }
    } else if (block_size == 4) {
      if (dst_depth == 1 || dst_depth == 3) {
        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
        conv_params.block_size = int4(2, 1, 1, 2);
      }
    } else if (block_size == 2) {
      conv_params.block_size = int4(2, 1, 1, 1);
    } else {
      conv_params.block_size = int4(1, 1, 1, 1);
    }
    conv_params.src_depth_loop_size = 1;
    MaliInfo mali_info = gpu_info.mali_info;
    if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && block_size == 1 && !mali_info.IsMidgard() &&
        definition.precision == CalculationsPrecision::F16) {
      conv_params.src_depth_loop_size = 4;
    }
    work_group_size_ = int3(4, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else if (gpu_info.IsAdreno()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      if (definition.precision == CalculationsPrecision::F16) {
        conv_params.block_size = int4(2, 2, 1, 2);
      } else if (definition.precision == CalculationsPrecision::F32_F16) {
        conv_params.block_size = int4(2, 1, 1, 2);
      } else {  // F32
        conv_params.block_size = int4(2, 2, 1, 1);
      }
    }
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    if (definition.src_tensors.size() == 2) {
      // dynamic weights supported only with buffers.
      conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4;
    }
  } else if (gpu_info.IsIntel()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(16, 1, 1);
      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    int sub_group_size = 16;
    const bool supports_subgroups =
        gpu_info.SupportsExtension("cl_khr_subgroups") ||
        gpu_info.SupportsExtension("cl_intel_subgroups");
    if (definition.precision != CalculationsPrecision::F32_F16 &&
        supports_subgroups &&
        gpu_info.SupportsExtension("cl_intel_required_subgroup_size") &&
        gpu_info.SupportsSubGroupWithSize(sub_group_size)) {
      conv_params.weights_upload_type =
          WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
      conv_params.simd_size = sub_group_size;
    } else {
      conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (gpu_info.IsApple()) {
    conv_params.block_size = int4(2, 2, 1, 2);
    work_group_size_ = int3(8, 4, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = true;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else {
    conv_params.block_size = int4(1, 1, 1, 4);
    work_group_size_ = int3(8, 2, 1);
    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
      conv_params.block_size.w = 2;
    } else {
      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  }
  if (conv_params.AreWeightsBuffer()) {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout = WeightsLayout::kOHWIOGroupO4I4;
    } else {
      conv_params.weights_layout = WeightsLayout::kOHWIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      conv_params.weights_layout = WeightsLayout::k2DX4O4YIsHWIAndXIsOOGroupI4;
    } else {
      conv_params.weights_layout = WeightsLayout::k2DX4I4YIsHWIAndXIsOOGroupO4;
    }
  }

  return conv_params;
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                             attr.dilations.w == 1 &&
                             attr.padding.prepended.w == 0 &&
                             attr.padding.appended.w == 0;
  const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
                             attr.dilations.h == 1 &&
                             attr.padding.prepended.h == 0 &&
                             attr.padding.appended.h == 0;
  return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                         x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution3DAttributes& attr, const BHWDC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                             attr.dilations.w == 1 &&
                             attr.padding.prepended.w == 0 &&
                             attr.padding.appended.w == 0;
  const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
                             attr.dilations.h == 1 &&
                             attr.padding.prepended.h == 0 &&
                             attr.padding.appended.h == 0;
  const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 &&
                             attr.dilations.d == 1 &&
                             attr.padding.prepended.d == 0 &&
                             attr.padding.appended.d == 0;

  ConvPowerVR::ConvParams result;
  BHWC shape;
  if (dst_shape) {
    shape.b = dst_shape->b;
    shape.h = dst_shape->h * dst_shape->d;
    shape.w = dst_shape->w;
    shape.c = dst_shape->c;
    result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                             x_kernel_is_1, y_kernel_is_1, false, &shape);
  } else {
    result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                             x_kernel_is_1, y_kernel_is_1, false, nullptr);
  }
  result.z_kernel_is_1 = z_kernel_is_1;
  return result;
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(weights_shape.b, 4);
  const int src_depth = DivideRoundUp(weights_shape.c, 4);
  const bool x_kernel_is_1 =
      weights_shape.w == 1 && attr.strides.w == 1 && attr.dilations.w == 1 &&
      attr.padding.prepended.w == 0 && attr.padding.appended.w == 0;
  const bool y_kernel_is_1 =
      weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
      attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
  return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
                         x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvPowerVR::ConvParams params = GuessBestParams(
      gpu_info, definition, src_depth, dst_depth, true, true, false, dst_shape);
  work_group_size_.x *= work_group_size_.y;
  work_group_size_.y = 1;
  params.block_size.x *= params.block_size.y;
  params.block_size.y = 1;
  return params;
}

ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvPowerVR::ConvParams params = GuessBestParams(
      gpu_info, definition, src_depth, dst_depth, true, true, true, dst_shape);
  params.block_size.x *= params.block_size.y;
  params.block_size.y = 1;
  return params;
}

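// Factory helpers: construct the operation, generate its code for the given
// GPU, and upload weights/biases from the attributes.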
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr,
                              const BHWC* dst_shape) {
  ConvPowerVR result(definition, attr, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadData(attr.weights, attr.bias);
  return result;
}

ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const FullyConnectedAttributes& attr,
                              const BHWC* dst_shape) {
  ConvPowerVR result(definition, attr, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadData(attr.weights, attr.bias);
  return result;
}

ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
                                            const OperationDef& definition,
                                            const Convolution2DAttributes& attr,
                                            const BHWC& weights_shape,
                                            const BHWC* dst_shape) {
  ConvPowerVR result(definition, attr, weights_shape, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadBias(attr.bias);
  return result;
}

ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
                                          const OperationDef& definition,
                                          const Convolution2DAttributes& attr,
                                          const BHWC* dst_shape) {
  ConvPowerVR result(definition);
  result.conv_params_ =
      result.GuessBestParamsWinograd(gpu_info, definition, attr, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadDataForWinograd4x4To6x6(attr.weights);
  return result;
}

ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
                                const OperationDef& definition,
                                const Convolution3DAttributes& attr,
                                const BHWDC* dst_shape) {
  ConvPowerVR result(definition, attr, gpu_info, dst_shape);
  result.GenerateCode(gpu_info);
  result.UploadWeights(attr.weights);
  result.UploadBias(attr.bias);
  return result;
}

}  // namespace gpu
}  // namespace tflite