• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CONV_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CONV_H_
18 
19 #include <vector>
20 
21 #include "tensorflow/lite/delegates/gpu/common/model.h"
22 #include "tensorflow/lite/delegates/gpu/common/operations.h"
23 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
24 #include "tensorflow/lite/delegates/gpu/metal/runtime_options.h"
25 
26 namespace tflite {
27 namespace gpu {
28 namespace metal {
29 
// Declares the general 2D convolution entry point: given the convolution
// attributes `params`, returns a list of compute task descriptors.
// NOTE(review): presumably `input_id`/`output_id` identify the source and
// destination values and `id` identifies the node; how `options` affects the
// generated tasks is not visible in this header — confirm in the
// implementation.
30 std::vector<ComputeTaskDescriptorPtr> Convolution(
31     int id, ValueId input_id, ValueId output_id,
32     const Convolution2DAttributes& params,
33     const metal::RuntimeOptions& options);
34 
35 // Convolution specialized for 1x1 kernels.
36 // Requires:
37 //   kernel_size = 1x1;
38 //   padding prepended and appended = 0x0;
39 //   dilation = 1x1;
40 //   stride = 1x1.
41 // Performs very well on A12 (iPhone XS, etc.).
42 // Performs well on A9/A10/A11 (iPhone 6S, iPhone 7, iPhone X, etc.).
43 // Performs poorly on A7/A8 (iPhone 5S, iPhone 6, etc.).
44 std::vector<ComputeTaskDescriptorPtr> Convolution1x1(
45     int id, ValueId input_id, ValueId output_id,
46     const Convolution2DAttributes& params, const RuntimeOptions& options);
47 
// NOTE(review): presumably reports whether `attr` satisfies the requirements
// of Convolution1x1 — confirm against the implementation.
48 // TODO(impjdi): Move it inside module.
49 bool CheckConvolution1x1Support(const Convolution2DAttributes& attr);
50 
51 // This convolution passes all conv parameters (besides output_channels)
52 // as dynamic arguments (uniform buffer) to the kernel.
53 // Different kernels can be generated depending on output_channels.
54 // A kernel can process 4/8/12/16 output channels per thread.
55 // Processing 16 output channels is the fastest but the least flexible.
56 std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
57     int id, ValueId input_id, ValueId output_id,
58     const Convolution2DAttributes& params, const RuntimeOptions& options);
59 
60 // This convolution maps threads onto output elements more precisely.
61 // For example, if the output tensor is 12x7 and the work group is 8x4,
62 // then the usual mapping needs 4 work groups to cover the tensor.
63 // But the tensor has only 84 elements (12*7), which can be covered by 3
64 // work groups of size 32. This version of convolution uses that precise
65 // mapping.
66 // Due to some hardware limitations, this convolution is not always
67 // faster. In general it works well on A12.
68 // Each thread processes 2 pixels in the XY dimension and a variable number
69 // of pixels in the Z dimension (depends on dst_channels).
70 std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise(
71     int id, ValueId input_id, ValueId output_id,
72     const Convolution2DAttributes& params, const RuntimeOptions& options);
73 
74 // Same as ConvolutionPrecise, but specialized for 1x1 kernels; each thread
75 // processes 1 pixel in the XY dimension.
76 // This convolution targets PowerVR in FP16 mode with an FP32 accumulator.
77 // It also works in other modes, but without good performance.
78 std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise1x1PowerVR(
79     int id, ValueId input_id, ValueId output_id,
80     const Convolution2DAttributes& params, const RuntimeOptions& options);
81 
// NOTE(review): presumably reports whether `attr` can be handled by
// ConvolutionPrecise1x1PowerVR — confirm against the implementation.
82 // TODO(impjdi): Move it inside module.
83 bool CheckConvolutionPrecise1x1Support(const Convolution2DAttributes& attr);
84 
85 // Calculates the number of threads that would be launched for
86 // ConvolutionGeneric or Convolution1x1 (threads_count1) and the number of
87 // threads that would be launched for ConvolutionPrecise (threads_count2),
88 // and returns threads_count1 / threads_count2 for the given dst_shape.
89 float GetThreadsRatioUsualToPreciseConvolution(const BHWC& dst_shape);
90 
91 }  // namespace metal
92 }  // namespace gpu
93 }  // namespace tflite
94 
95 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CONV_H_
96