1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_H_
18
19 #include <memory>
20 #include <vector>
21
22 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
31 #include "tensorflow/lite/delegates/gpu/common/types.h"
32
33 namespace tflite {
34 namespace gpu {
35
36 class DepthwiseConv3x3 : public GPUOperation {
37 public:
38 DepthwiseConv3x3() = default;
39 void GetPossibleKernelWorkGroups(
40 TuningType tuning_type, const GpuInfo& gpu_info,
41 const KernelInfo& kernel_info,
42 std::vector<int3>* work_groups) const override;
43 int3 GetGridSize() const override;
44
45 // Move only
46 DepthwiseConv3x3(DepthwiseConv3x3&& operation);
47 DepthwiseConv3x3& operator=(DepthwiseConv3x3&& operation);
48 DepthwiseConv3x3(const DepthwiseConv3x3&) = delete;
49 DepthwiseConv3x3& operator=(const DepthwiseConv3x3&) = delete;
50
51 private:
52 explicit DepthwiseConv3x3(const OperationDef& definition,
53 bool weights_are_buffer, bool local_mem_uploads,
54 const GpuInfo& gpu_info);
55 template <DataType T>
56 void UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI, T>& weights,
57 const tflite::gpu::Tensor<Linear, T>& biases,
58 bool weights_are_buffer);
59
60 friend DepthwiseConv3x3 CreateDepthwiseConv3x3(
61 const GpuInfo& gpu_info, const OperationDef& definition,
62 const DepthwiseConvolution2DAttributes& attr);
63
64 template <DataType S, typename T>
65 void RearrangeWeightsAndBiasesData(
66 const tflite::gpu::Tensor<OHWI, S>& weights,
67 const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
68
69 std::string GenerateDepthwiseConvCode(const OperationDef& op_def,
70 bool weights_are_buffer,
71 bool local_mem_uploads);
72
73 bool local_mem_uploads_;
74 };
75
76 template <DataType T>
UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases,bool weights_are_buffer)77 void DepthwiseConv3x3::UploadWeightsAndBiases(
78 const tflite::gpu::Tensor<OHWI, T>& weights,
79 const tflite::gpu::Tensor<Linear, T>& biases, bool weights_are_buffer) {
80 const int src_depth = DivideRoundUp(weights.shape.i, 4);
81 int texture_width = 10; // 3x3 kernel + 1 bias
82 int texture_height = src_depth;
83 const int elements_count = texture_width * texture_height;
84 const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
85 const int float4_size = fp32_weights ? 16 : 8;
86
87 std::vector<uint8_t> data(float4_size * elements_count);
88 if (fp32_weights) {
89 float4* ptr = reinterpret_cast<float4*>(data.data());
90 RearrangeWeightsAndBiasesData(weights, biases,
91 absl::MakeSpan(ptr, elements_count));
92 } else {
93 half4* ptr = reinterpret_cast<half4*>(data.data());
94 RearrangeWeightsAndBiasesData(weights, biases,
95 absl::MakeSpan(ptr, elements_count));
96 }
97
98 if (weights_are_buffer) {
99 BufferDescriptor desc;
100 desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
101 desc.element_size = 4;
102 desc.size = float4_size * elements_count;
103 desc.data = std::move(data);
104 args_.AddObject("weights",
105 absl::make_unique<BufferDescriptor>(std::move(desc)));
106 } else {
107 Texture2DDescriptor desc;
108 desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
109 desc.size = int2(texture_width, texture_height);
110 desc.data = std::move(data);
111 args_.AddObject("weights",
112 absl::make_unique<Texture2DDescriptor>(std::move(desc)));
113 }
114 }
115
116 template <DataType S, typename T>
RearrangeWeightsAndBiasesData(const tflite::gpu::Tensor<OHWI,S> & weights,const tflite::gpu::Tensor<Linear,S> & biases,absl::Span<T> dst)117 void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
118 const tflite::gpu::Tensor<OHWI, S>& weights,
119 const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
120 const int src_depth = DivideRoundUp(weights.shape.i, 4);
121
122 int counter = 0;
123 for (int s = 0; s < src_depth; ++s) {
124 for (int y = 0; y < 3; ++y) {
125 for (int x = 0; x < 3; ++x) {
126 T filter_val;
127 for (int i = 0; i < 4; ++i) {
128 const int s_ch = s * 4 + i;
129 if (s_ch < weights.shape.i) {
130 const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
131 filter_val[i] = weights.data[f_index];
132 } else {
133 filter_val[i] = 0.0f;
134 }
135 }
136 dst[counter++] = filter_val;
137 }
138 }
139
140 T bias_val;
141 for (int i = 0; i < 4; ++i) {
142 const int dst_ch = s * 4 + i;
143 bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
144 }
145 dst[counter++] = bias_val;
146 }
147 }
148
// Declaration only; the definition is not part of this header.
// NOTE(review): judging by the name, this presumably reports whether `attr`
// describes a convolution that DepthwiseConv3x3 can handle - confirm the exact
// conditions against the definition.
149 bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr);
150
// Factory returning a DepthwiseConv3x3 by value. It is declared a friend of
// DepthwiseConv3x3, which gives it access to that class's private constructor
// and private upload helpers. The definition is not part of this header.
// NOTE(review): callers are presumably expected to check
// IsDepthwiseConv3x3Supported(attr) first - confirm against the definition.
151 DepthwiseConv3x3 CreateDepthwiseConv3x3(
152 const GpuInfo& gpu_info, const OperationDef& definition,
153 const DepthwiseConvolution2DAttributes& attr);
154
155 } // namespace gpu
156 } // namespace tflite
157
158 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_H_
159