// NOTE(review): code-browser navigation chrome ("Home / Line# / Scopes / Navigate /
// Raw / Download") was captured into this file by the scraper; neutralized as a comment.
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
18 
19 #include <memory>
20 #include <string>
21 #include <utility>
22 #include <vector>
23 
24 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
25 #include "tensorflow/lite/delegates/gpu/common/operations.h"
26 #include "tensorflow/lite/delegates/gpu/common/shape.h"
27 #include "tensorflow/lite/delegates/gpu/common/status.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
31 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
32 #include "tensorflow/lite/delegates/gpu/common/types.h"
33 
34 namespace tflite {
35 namespace gpu {
36 
37 class DepthwiseConv : public GPUOperation {
38  public:
39   int3 GetGridSize() const override;
40   void GetPossibleKernelWorkGroups(
41       TuningType tuning_type, const GpuInfo& gpu_info,
42       const KernelInfo& kernel_info,
43       std::vector<int3>* work_groups) const override;
44 
45   // Move only
46   DepthwiseConv(DepthwiseConv&& operation) = default;
47   DepthwiseConv& operator=(DepthwiseConv&& operation) = default;
48   DepthwiseConv(const DepthwiseConv&) = delete;
49   DepthwiseConv& operator=(const DepthwiseConv&) = delete;
50 
51   friend DepthwiseConv CreateDepthwiseConvolution2D(
52       const GpuInfo& gpu_info, const OperationDef& definition,
53       const DepthwiseConvolution2DAttributes& attr);
54 
55   friend DepthwiseConv CreateDepthwiseConvolution2DDynamicWeights(
56       const GpuInfo& gpu_info, const OperationDef& definition,
57       const DepthwiseConvolution2DAttributes& attr);
58 
59   friend DepthwiseConv CreateDepthwiseConvolution3D(
60       const GpuInfo& gpu_info, const OperationDef& definition,
61       const DepthwiseConvolution3DAttributes& attr);
62 
63  private:
64   struct DepthwiseConvParams {
UseLocalMemDepthwiseConvParams65     bool UseLocalMem() const {
66       return use_weights_caching || use_spatial_caching;
67     }
GetKernelsTotalSizeDepthwiseConvParams68     int GetKernelsTotalSize() const {
69       return x_kernel_size * y_kernel_size * z_kernel_size;
70     }
GetWorkGroupTotalSizeDepthwiseConvParams71     int GetWorkGroupTotalSize() const {
72       return work_group_size.x * work_group_size.y * work_group_size.z;
73     }
74     int channel_multiplier;
75     // Supportd only tensors with Width & Height spatial dimensions
76     // optional, if true, spatial dims will be uploaded to local mem
77     bool use_spatial_caching = false;
78     // optional, if true, weights will be uploaded to local memory
79     bool use_weights_caching = false;
80     // optional, if UsesLocalMem() return true this field must be initialized
81     int3 work_group_size = int3(1, 1, 1);
82 
83     // optional, if UsesLocalMem() return true this field must be initialized
84     int x_kernel_size = 1;
85     // optional, if UsesLocalMem() return true this field must be initialized
86     int y_kernel_size = 1;
87     // optional, if UsesLocalMem() return true this field must be initialized
88     int z_kernel_size = 1;
89 
90     // optional, if use_spatial_caching true this field must be initialized
91     int x_dilation_size = 1;
92     // optional, if use_spatial_caching true this field must be initialized
93     int y_dilation_size = 1;
94     // optional, if use_spatial_caching true this field must be initialized
95     int z_dilation_size = 1;
96   };
97 
98   explicit DepthwiseConv(const OperationDef& definition,
99                          const DepthwiseConvParams& params);
100 
101   std::string GenerateSrcUpload(const GpuInfo& gpu_info);
102   std::string GenerateWeightsUpload(const GpuInfo& gpu_info);
103   std::string GenerateCode(const GpuInfo& gpu_info);
104 
105   template <DataType T>
106   void UploadWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, T>& weights,
107                                 bool weights_are_buffer);
108 
109   template <DataType T>
110   void UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, T>& weights,
111                                 bool weights_are_buffer);
112 
113   DepthwiseConvParams params_;
114 };
115 
116 template <DataType S, typename T>
RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI,S> & weights,absl::Span<T> dst)117 void RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, S>& weights,
118                                  absl::Span<T> dst) {
119   const int dst_channels = weights.shape.i * weights.shape.o;
120   const int dst_depth = DivideRoundUp(dst_channels, 4);
121   const int kernel_x = weights.shape.w;
122   const int kernel_y = weights.shape.h;
123 
124   int counter = 0;
125   for (int d = 0; d < dst_depth; ++d) {
126     for (int y = 0; y < kernel_y; ++y) {
127       for (int x = 0; x < kernel_x; ++x) {
128         T filter_val;
129         for (int i = 0; i < 4; ++i) {
130           const int d_ch = d * 4 + i;
131           if (d_ch < dst_channels) {
132             const int f_index = weights.shape.LinearIndex(
133                 {d_ch % weights.shape.o, y, x, d_ch / weights.shape.o});
134             filter_val[i] = weights.data[f_index];
135           } else {
136             filter_val[i] = 0.0f;
137           }
138         }
139         dst[counter++] = filter_val;
140       }
141     }
142   }
143 }
144 
145 template <DataType T>
UploadWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI,T> & weights,bool weights_are_buffer)146 void DepthwiseConv::UploadWeightsForDWConv2D(
147     const tflite::gpu::Tensor<OHWI, T>& weights, bool weights_are_buffer) {
148   const int dst_channels = weights.shape.i * weights.shape.o;
149   const int dst_slices = DivideRoundUp(dst_channels, 4);
150   const int kernel_x = weights.shape.w;
151   const int kernel_y = weights.shape.h;
152 
153   const int elements_count = kernel_x * kernel_y * dst_slices;
154 
155   const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
156   const int float4_size = fp32_weights ? 16 : 8;
157 
158   std::vector<uint8_t> data(float4_size * elements_count);
159 
160   if (fp32_weights) {
161     float4* ptr = reinterpret_cast<float4*>(data.data());
162     RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
163   } else {
164     half4* ptr = reinterpret_cast<half4*>(data.data());
165     RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
166   }
167 
168   if (weights_are_buffer) {
169     BufferDescriptor desc;
170     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
171     desc.element_size = 4;
172     desc.size = float4_size * elements_count;
173     desc.data = std::move(data);
174     args_.AddObject("weights", std::make_unique<BufferDescriptor>(desc));
175   } else {
176     TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
177         fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
178         TensorStorageType::TEXTURE_2D, kernel_x * kernel_y, dst_slices,
179         data.data());
180     args_.AddObject("weights", std::make_unique<TensorDescriptor>(desc));
181   }
182 }
183 
184 template <DataType S, typename T>
RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI,S> & weights,absl::Span<T> dst)185 void RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, S>& weights,
186                                  absl::Span<T> dst) {
187   const int dst_channels = weights.shape.i * weights.shape.o;
188   const int dst_slices = DivideRoundUp(dst_channels, 4);
189   const int kernel_x = weights.shape.w;
190   const int kernel_y = weights.shape.h;
191   const int kernel_z = weights.shape.d;
192 
193   int counter = 0;
194   for (int d = 0; d < dst_slices; ++d) {
195     for (int z = 0; z < kernel_z; ++z) {
196       for (int y = 0; y < kernel_y; ++y) {
197         for (int x = 0; x < kernel_x; ++x) {
198           T filter_val;
199           for (int i = 0; i < 4; ++i) {
200             const int d_ch = d * 4 + i;
201             if (d_ch < dst_channels) {
202               const int f_index = weights.shape.LinearIndex(
203                   {d_ch % weights.shape.o, y, x, z, d_ch / weights.shape.o});
204               filter_val[i] = weights.data[f_index];
205             } else {
206               filter_val[i] = 0.0f;
207             }
208           }
209           dst[counter++] = filter_val;
210         }
211       }
212     }
213   }
214 }
215 
216 template <DataType T>
UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI,T> & weights,bool weights_are_buffer)217 void DepthwiseConv::UploadWeightsForDWConv3D(
218     const tflite::gpu::Tensor<OHWDI, T>& weights, bool weights_are_buffer) {
219   const int dst_channels = weights.shape.i * weights.shape.o;
220   const int dst_slices = DivideRoundUp(dst_channels, 4);
221   const int kernel_x = weights.shape.w;
222   const int kernel_y = weights.shape.h;
223   const int kernel_z = weights.shape.d;
224 
225   const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices;
226 
227   const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
228   const int float4_size = fp32_weights ? 16 : 8;
229 
230   std::vector<uint8_t> data(float4_size * elements_count);
231 
232   if (fp32_weights) {
233     float4* ptr = reinterpret_cast<float4*>(data.data());
234     RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
235   } else {
236     half4* ptr = reinterpret_cast<half4*>(data.data());
237     RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
238   }
239 
240   if (weights_are_buffer) {
241     BufferDescriptor desc;
242     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
243     desc.element_size = 4;
244     desc.size = float4_size * elements_count;
245     desc.data = std::move(data);
246     args_.AddObject("weights",
247                     std::make_unique<BufferDescriptor>(std::move(desc)));
248   } else {
249     TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
250         fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
251         TensorStorageType::TEXTURE_2D, kernel_x * kernel_y * kernel_z,
252         dst_slices, data.data());
253     args_.AddObject("weights", std::make_unique<TensorDescriptor>(desc));
254   }
255 }
256 
257 DepthwiseConv CreateDepthwiseConvolution2D(
258     const GpuInfo& gpu_info, const OperationDef& definition,
259     const DepthwiseConvolution2DAttributes& attr);
260 
261 DepthwiseConv CreateDepthwiseConvolution2DDynamicWeights(
262     const GpuInfo& gpu_info, const OperationDef& definition,
263     const DepthwiseConvolution2DAttributes& attr);
264 
265 DepthwiseConv CreateDepthwiseConvolution3D(
266     const GpuInfo& gpu_info, const OperationDef& definition,
267     const DepthwiseConvolution3DAttributes& attr);
268 
269 }  // namespace gpu
270 }  // namespace tflite
271 
272 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_H_
273