1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CONV_H_ 17 #define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CONV_H_ 18 19 #include <vector> 20 21 #include "tensorflow/lite/delegates/gpu/common/model.h" 22 #include "tensorflow/lite/delegates/gpu/common/operations.h" 23 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" 24 #include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" 25 26 namespace tflite { 27 namespace gpu { 28 namespace metal { 29 30 std::vector<ComputeTaskDescriptorPtr> Convolution( 31 int id, ValueId input_id, ValueId output_id, 32 const Convolution2DAttributes& params, 33 const metal::RuntimeOptions& options); 34 35 // Convolution for kernel 1x1 36 // require: 37 // kernel_size = 1x1; 38 // padding prepended and appended = 0x0 39 // dilation = 1x1; 40 // stride = 1x1; 41 // Works very good on A12 (IPhoneXS, etc). 42 // Works good on A9/A10/A11 (IPhone6S, IPhone7, IPhoneX, etc). 43 // Works bad on A7/A8 (IPhone5S, IPhone6, etc). 44 std::vector<ComputeTaskDescriptorPtr> Convolution1x1( 45 int id, ValueId input_id, ValueId output_id, 46 const Convolution2DAttributes& params, const RuntimeOptions& options); 47 48 // TODO(impjdi): Move it inside module. 49 bool CheckConvolution1x1Support(const Convolution2DAttributes& attr); 50 51 // This convolution pass all conv parameters (beside output_channels) 52 // as dynamic arguments (uniform buffer) to kernel. 53 // Depending on output_channels can be generated different kernels 54 // Kernel can proceed 4/8/12/16 output channels per one thread. 55 // 16 channels output is the fastest but the least flexible. 56 std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric( 57 int id, ValueId input_id, ValueId output_id, 58 const Convolution2DAttributes& params, const RuntimeOptions& options); 59 60 // This convolution makes more precise mapping of threads on elements. 61 // For example, if we have output tensor 12x7 and work group = 8x4, 62 // then we need 4 workgroups to cover this tensor in usual case. 63 // But in general we have only 84 elements(12*7), and we can cover it with 3 64 // workgroups of size 32. So this version of convolution use this precise 65 // mapping. 66 // But this convolution, due to some hardware limitations, doesn't work better 67 // always. In general it works good on A12. 68 // Each thread process 2 pixels in XY dimension and variable amount of pixels 69 // in Z dimension(depends on dst_channels). 70 std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise( 71 int id, ValueId input_id, ValueId output_id, 72 const Convolution2DAttributes& params, const RuntimeOptions& options); 73 74 // As previous, but specific for 1x1 and each thread process 1 pixel in XY 75 // dimension. 76 // This convolution for PowerVR in FP16 mode with FP32 accumulator 77 // It will work in other modes also, but not with good performance 78 std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise1x1PowerVR( 79 int id, ValueId input_id, ValueId output_id, 80 const Convolution2DAttributes& params, const RuntimeOptions& options); 81 82 // TODO(impjdi): Move it inside module. 83 bool CheckConvolutionPrecise1x1Support(const Convolution2DAttributes& attr); 84 85 // This function calculates amount of threads that should be launched for 86 // ConvolutionGeneric or Convolution1x1 (threads_count1) and amount of threads 87 // that should be launched for ConvolutionPrecise (threads_count2) and returns 88 // threads_count1 / threads_count2. 89 float GetThreadsRatioUsualToPreciseConvolution(const BHWC& dst_shape); 90 91 } // namespace metal 92 } // namespace gpu 93 } // namespace tflite 94 95 #endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_CONV_H_ 96