1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
18
19 #include <memory>
20 #include <string>
21 #include <utility>
22 #include <vector>
23
24 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
25 #include "tensorflow/lite/delegates/gpu/common/operations.h"
26 #include "tensorflow/lite/delegates/gpu/common/shape.h"
27 #include "tensorflow/lite/delegates/gpu/common/status.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
31 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
32 #include "tensorflow/lite/delegates/gpu/common/types.h"
33
34 namespace tflite {
35 namespace gpu {
36
37 class DepthwiseConv : public GPUOperation {
38 public:
39 int3 GetGridSize() const override;
40 void GetPossibleKernelWorkGroups(
41 TuningType tuning_type, const GpuInfo& gpu_info,
42 const KernelInfo& kernel_info,
43 std::vector<int3>* work_groups) const override;
44
45 // Move only
46 DepthwiseConv(DepthwiseConv&& operation) = default;
47 DepthwiseConv& operator=(DepthwiseConv&& operation) = default;
48 DepthwiseConv(const DepthwiseConv&) = delete;
49 DepthwiseConv& operator=(const DepthwiseConv&) = delete;
50
51 friend DepthwiseConv CreateDepthwiseConvolution2D(
52 const GpuInfo& gpu_info, const OperationDef& definition,
53 const DepthwiseConvolution2DAttributes& attr);
54
55 friend DepthwiseConv CreateDepthwiseConvolution2DDynamicWeights(
56 const GpuInfo& gpu_info, const OperationDef& definition,
57 const DepthwiseConvolution2DAttributes& attr);
58
59 friend DepthwiseConv CreateDepthwiseConvolution3D(
60 const GpuInfo& gpu_info, const OperationDef& definition,
61 const DepthwiseConvolution3DAttributes& attr);
62
63 private:
64 struct DepthwiseConvParams {
UseLocalMemDepthwiseConvParams65 bool UseLocalMem() const {
66 return use_weights_caching || use_spatial_caching;
67 }
GetKernelsTotalSizeDepthwiseConvParams68 int GetKernelsTotalSize() const {
69 return x_kernel_size * y_kernel_size * z_kernel_size;
70 }
GetWorkGroupTotalSizeDepthwiseConvParams71 int GetWorkGroupTotalSize() const {
72 return work_group_size.x * work_group_size.y * work_group_size.z;
73 }
74 int channel_multiplier;
75 // Supportd only tensors with Width & Height spatial dimensions
76 // optional, if true, spatial dims will be uploaded to local mem
77 bool use_spatial_caching = false;
78 // optional, if true, weights will be uploaded to local memory
79 bool use_weights_caching = false;
80 // optional, if UsesLocalMem() return true this field must be initialized
81 int3 work_group_size = int3(1, 1, 1);
82
83 // optional, if UsesLocalMem() return true this field must be initialized
84 int x_kernel_size = 1;
85 // optional, if UsesLocalMem() return true this field must be initialized
86 int y_kernel_size = 1;
87 // optional, if UsesLocalMem() return true this field must be initialized
88 int z_kernel_size = 1;
89
90 // optional, if use_spatial_caching true this field must be initialized
91 int x_dilation_size = 1;
92 // optional, if use_spatial_caching true this field must be initialized
93 int y_dilation_size = 1;
94 // optional, if use_spatial_caching true this field must be initialized
95 int z_dilation_size = 1;
96 };
97
98 explicit DepthwiseConv(const OperationDef& definition,
99 const DepthwiseConvParams& params);
100
101 std::string GenerateSrcUpload(const GpuInfo& gpu_info);
102 std::string GenerateWeightsUpload(const GpuInfo& gpu_info);
103 std::string GenerateCode(const GpuInfo& gpu_info);
104
105 template <DataType T>
106 void UploadWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, T>& weights,
107 bool weights_are_buffer);
108
109 template <DataType T>
110 void UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, T>& weights,
111 bool weights_are_buffer);
112
113 DepthwiseConvParams params_;
114 };
115
116 template <DataType S, typename T>
RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI,S> & weights,absl::Span<T> dst)117 void RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, S>& weights,
118 absl::Span<T> dst) {
119 const int dst_channels = weights.shape.i * weights.shape.o;
120 const int dst_depth = DivideRoundUp(dst_channels, 4);
121 const int kernel_x = weights.shape.w;
122 const int kernel_y = weights.shape.h;
123
124 int counter = 0;
125 for (int d = 0; d < dst_depth; ++d) {
126 for (int y = 0; y < kernel_y; ++y) {
127 for (int x = 0; x < kernel_x; ++x) {
128 T filter_val;
129 for (int i = 0; i < 4; ++i) {
130 const int d_ch = d * 4 + i;
131 if (d_ch < dst_channels) {
132 const int f_index = weights.shape.LinearIndex(
133 {d_ch % weights.shape.o, y, x, d_ch / weights.shape.o});
134 filter_val[i] = weights.data[f_index];
135 } else {
136 filter_val[i] = 0.0f;
137 }
138 }
139 dst[counter++] = filter_val;
140 }
141 }
142 }
143 }
144
145 template <DataType T>
UploadWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI,T> & weights,bool weights_are_buffer)146 void DepthwiseConv::UploadWeightsForDWConv2D(
147 const tflite::gpu::Tensor<OHWI, T>& weights, bool weights_are_buffer) {
148 const int dst_channels = weights.shape.i * weights.shape.o;
149 const int dst_slices = DivideRoundUp(dst_channels, 4);
150 const int kernel_x = weights.shape.w;
151 const int kernel_y = weights.shape.h;
152
153 const int elements_count = kernel_x * kernel_y * dst_slices;
154
155 const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
156 const int float4_size = fp32_weights ? 16 : 8;
157
158 std::vector<uint8_t> data(float4_size * elements_count);
159
160 if (fp32_weights) {
161 float4* ptr = reinterpret_cast<float4*>(data.data());
162 RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
163 } else {
164 half4* ptr = reinterpret_cast<half4*>(data.data());
165 RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
166 }
167
168 if (weights_are_buffer) {
169 BufferDescriptor desc;
170 desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
171 desc.element_size = 4;
172 desc.size = float4_size * elements_count;
173 desc.data = std::move(data);
174 args_.AddObject("weights", std::make_unique<BufferDescriptor>(desc));
175 } else {
176 TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
177 fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
178 TensorStorageType::TEXTURE_2D, kernel_x * kernel_y, dst_slices,
179 data.data());
180 args_.AddObject("weights", std::make_unique<TensorDescriptor>(desc));
181 }
182 }
183
184 template <DataType S, typename T>
RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI,S> & weights,absl::Span<T> dst)185 void RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, S>& weights,
186 absl::Span<T> dst) {
187 const int dst_channels = weights.shape.i * weights.shape.o;
188 const int dst_slices = DivideRoundUp(dst_channels, 4);
189 const int kernel_x = weights.shape.w;
190 const int kernel_y = weights.shape.h;
191 const int kernel_z = weights.shape.d;
192
193 int counter = 0;
194 for (int d = 0; d < dst_slices; ++d) {
195 for (int z = 0; z < kernel_z; ++z) {
196 for (int y = 0; y < kernel_y; ++y) {
197 for (int x = 0; x < kernel_x; ++x) {
198 T filter_val;
199 for (int i = 0; i < 4; ++i) {
200 const int d_ch = d * 4 + i;
201 if (d_ch < dst_channels) {
202 const int f_index = weights.shape.LinearIndex(
203 {d_ch % weights.shape.o, y, x, z, d_ch / weights.shape.o});
204 filter_val[i] = weights.data[f_index];
205 } else {
206 filter_val[i] = 0.0f;
207 }
208 }
209 dst[counter++] = filter_val;
210 }
211 }
212 }
213 }
214 }
215
216 template <DataType T>
UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI,T> & weights,bool weights_are_buffer)217 void DepthwiseConv::UploadWeightsForDWConv3D(
218 const tflite::gpu::Tensor<OHWDI, T>& weights, bool weights_are_buffer) {
219 const int dst_channels = weights.shape.i * weights.shape.o;
220 const int dst_slices = DivideRoundUp(dst_channels, 4);
221 const int kernel_x = weights.shape.w;
222 const int kernel_y = weights.shape.h;
223 const int kernel_z = weights.shape.d;
224
225 const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices;
226
227 const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
228 const int float4_size = fp32_weights ? 16 : 8;
229
230 std::vector<uint8_t> data(float4_size * elements_count);
231
232 if (fp32_weights) {
233 float4* ptr = reinterpret_cast<float4*>(data.data());
234 RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
235 } else {
236 half4* ptr = reinterpret_cast<half4*>(data.data());
237 RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
238 }
239
240 if (weights_are_buffer) {
241 BufferDescriptor desc;
242 desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
243 desc.element_size = 4;
244 desc.size = float4_size * elements_count;
245 desc.data = std::move(data);
246 args_.AddObject("weights",
247 std::make_unique<BufferDescriptor>(std::move(desc)));
248 } else {
249 TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
250 fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
251 TensorStorageType::TEXTURE_2D, kernel_x * kernel_y * kernel_z,
252 dst_slices, data.data());
253 args_.AddObject("weights", std::make_unique<TensorDescriptor>(desc));
254 }
255 }
256
// Factory functions returning a DepthwiseConv by value. Each is declared a
// friend of DepthwiseConv, whose constructor is private. Their definitions
// are not part of this header.

// Builds a DepthwiseConv from 2D depthwise convolution attributes.
DepthwiseConv CreateDepthwiseConvolution2D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr);

// Builds a DepthwiseConv from 2D depthwise convolution attributes.
// NOTE(review): the name suggests the weights are supplied at run time rather
// than taken from `attr` - confirm against the implementation.
DepthwiseConv CreateDepthwiseConvolution2DDynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr);

// Builds a DepthwiseConv from 3D depthwise convolution attributes.
DepthwiseConv CreateDepthwiseConvolution3D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution3DAttributes& attr);
268
269 } // namespace gpu
270 } // namespace tflite
271
272 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
273