1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
18 
19 #include <cstring>
20 #include <vector>
21 
22 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
33 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
34 #include "tensorflow/lite/delegates/gpu/common/types.h"
35 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
36 
37 namespace tflite {
38 namespace gpu {
39 
40 class ConvPowerVR : public GPUOperation {
41  public:
42   ConvPowerVR() = default;
43   void GetPossibleKernelWorkGroups(
44       TuningType tuning_type, const GpuInfo& gpu_info,
45       const KernelInfo& kernel_info,
46       std::vector<int3>* work_groups) const override;
47   absl::Status BindArguments(ArgumentsBinder* args) override;
48   int3 GetGridSize() const override;
49 
GetWeightsDescription()50   WeightsDescription GetWeightsDescription() const {
51     WeightsDescription desc;
52     desc.layout = conv_params_.weights_layout;
53     desc.output_group_size = conv_params_.block_size.w;
54     return desc;
55   }
56 
57   // Move only
58   ConvPowerVR(ConvPowerVR&& operation);
59   ConvPowerVR& operator=(ConvPowerVR&& operation);
60   ConvPowerVR(const ConvPowerVR&) = delete;
61   ConvPowerVR& operator=(const ConvPowerVR&) = delete;
62 
63  private:
64   enum class WeightsUploadType {
65     LOCAL_MEM_ASYNC_SUBGROUP,  // we use it for PowerVR with workgroup size = 32
66     LOCAL_MEM_BY_THREADS,
67     GLOBAL_MEM,
68     CONSTANT_MEM,
69     PRIVATE_MEM_SIMD_BROADCAST,
70     TEXTURES_MEM_X4,  // 4 textures for weights
71   };
72 
73   struct ConvParams {
74     // Usually we use this combinations for CalculationPrecision:
75     // F32: all F32
76     // F16: all F16
77     // F32_F16: all besides accumulator is F16, including weights
78     // But for PowerVR we can achieve better performance in F32_F16 with F32
79     // weights, so for PowerVR in this kernel we have F32 weights for
80     // F32_F16 precision mode
81     DataType weights_data_type;  // used for weights and biases
82     int4 block_size;             // WHDS
83     bool fixed_work_group_size;
84     bool linear_spatial;  // spatial dimensions are Width/Height/Depth
85     bool linear_all;  // linear_spatial & linear_all can not be used together,
86                       // linear_all can not be used with WeightsUploadTypes
87                       // that use workgroups(subgroups) for
88                       // uploading(LOCAL_MEM_BY_THREADS for example).
89     bool different_weights_for_height;
90     int src_depth_loop_size;
91     WeightsUploadType weights_upload_type;
92     bool x_kernel_is_1;
93     bool y_kernel_is_1;
94     bool z_kernel_is_1;
95     WeightsLayout weights_layout;
96 
97     // used only with PRIVATE_MEM_SIMD_BROADCAST
98     int simd_size = 1;
99 
AreWeightsBufferConvParams100     bool AreWeightsBuffer() const {
101       return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4;
102     }
103 
IsPrivateMemBroadcastConvParams104     bool IsPrivateMemBroadcast() const {
105       return weights_upload_type ==
106              WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
107     }
108   };
109 
110   ConvPowerVR(const OperationDef& definition,
111               const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
112               const BHWC* dst_shape = nullptr);
113   ConvPowerVR(const OperationDef& definition,
114               const Convolution2DAttributes& attr, const BHWC& weights_shape,
115               const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
116   ConvPowerVR(const OperationDef& definition,
117               const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
118               const BHWC* dst_shape = nullptr);
119   explicit ConvPowerVR(const OperationDef& definition);
120   ConvPowerVR(const OperationDef& definition,
121               const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
122               const BHWDC* dst_shape = nullptr);
123 
124   void GenerateCode(const GpuInfo& gpu_info);
125 
126   template <DataType T>
127   void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
128                   const tflite::gpu::Tensor<Linear, T>& biases);
129   template <DataType T>
130   void UploadDataForWinograd4x4To6x6(
131       const tflite::gpu::Tensor<OHWI, T>& weights);
132 
133   template <DataType T>
134   void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
135 
136   template <DataType T>
137   void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
138 
139   template <DataType T>
140   void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
141 
142   friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
143                                        const OperationDef& definition,
144                                        const Convolution2DAttributes& attr,
145                                        const BHWC* dst_shape);
146 
147   friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
148                                        const OperationDef& definition,
149                                        const FullyConnectedAttributes& attr,
150                                        const BHWC* dst_shape);
151 
152   friend ConvPowerVR CreateConvPowerVRDynamicWeights(
153       const GpuInfo& gpu_info, const OperationDef& definition,
154       const Convolution2DAttributes& attr, const BHWC& weights_shape,
155       const BHWC* dst_shape);
156 
157   friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
158       const GpuInfo& gpu_info, const OperationDef& definition,
159       const Convolution2DAttributes& attr, const BHWC* dst_shape);
160 
161   friend ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
162                                          const OperationDef& definition,
163                                          const Convolution3DAttributes& attr,
164                                          const BHWDC* dst_shape);
165 
166   ConvParams GuessBestParams(const GpuInfo& gpu_info,
167                              const OperationDef& definition,
168                              const Convolution2DAttributes& attr,
169                              const BHWC* dst_shape = nullptr);
170   ConvParams GuessBestParams(const GpuInfo& gpu_info,
171                              const OperationDef& definition,
172                              const Convolution2DAttributes& attr,
173                              const BHWC& weights_shape,
174                              const BHWC* dst_shape = nullptr);
175   ConvParams GuessBestParams(const GpuInfo& gpu_info,
176                              const OperationDef& definition,
177                              const FullyConnectedAttributes& attr,
178                              const BHWC* dst_shape = nullptr);
179   ConvParams GuessBestParamsWinograd(const GpuInfo& gpu_info,
180                                      const OperationDef& definition,
181                                      const Convolution2DAttributes& attr,
182                                      const BHWC* dst_shape = nullptr);
183   ConvParams GuessBestParams(const GpuInfo& gpu_info,
184                              const OperationDef& definition,
185                              const Convolution3DAttributes& attr,
186                              const BHWDC* dst_shape = nullptr);
187   ConvParams GuessBestParams(const GpuInfo& gpu_info,
188                              const OperationDef& definition, int src_depth,
189                              int dst_depth, bool x_kernel_is_1,
190                              bool y_kernel_is_1,
191                              bool different_weights_for_height,
192                              const BHWC* dst_shape = nullptr);
193 
194   std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
195                            bool stride_correction,
196                            const ConvParams& conv_params);
197 
198   int4 stride_;
199   int4 padding_;
200   int4 kernel_size_;
201   int4 dilation_;
202   ConvParams conv_params_;
203 };
204 
205 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)206 void ConvPowerVR::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
207                              const tflite::gpu::Tensor<Linear, T>& biases) {
208   UploadWeights(weights);
209   UploadBias(biases);
210 }
211 
212 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)213 void ConvPowerVR::UploadDataForWinograd4x4To6x6(
214     const tflite::gpu::Tensor<OHWI, T>& weights) {
215   tflite::gpu::Tensor<OHWI, T> wino_weights;
216   RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
217   UploadWeights(wino_weights);
218   tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
219   biases.shape = Linear(weights.shape.o);
220   biases.data.resize(weights.shape.o, 0.0f);
221   UploadBias(biases);
222 }
223 
224 template <DataType T>
UploadBias(const tflite::gpu::Tensor<Linear,T> & bias)225 void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
226   BufferDescriptor desc;
227   desc.element_type = conv_params_.weights_data_type;
228   desc.element_size = 4;
229   desc.memory_type = conv_params_.weights_upload_type ==
230                              ConvPowerVR::WeightsUploadType::CONSTANT_MEM
231                          ? MemoryType::CONSTANT
232                          : MemoryType::GLOBAL;
233   const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
234                              ? sizeof(float)
235                              : sizeof(half);
236   int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
237   desc.size = float_size * aligned_channels;
238   desc.data.resize(desc.size);
239   if (conv_params_.weights_data_type == DataType::FLOAT32) {
240     float* gpu_data = reinterpret_cast<float*>(desc.data.data());
241     for (int i = 0; i < aligned_channels; ++i) {
242       gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
243     }
244   } else {
245     half* gpu_data = reinterpret_cast<half*>(desc.data.data());
246     for (int i = 0; i < aligned_channels; ++i) {
247       gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
248     }
249   }
250   args_.AddObject("biases",
251                   absl::make_unique<BufferDescriptor>(std::move(desc)));
252 }
253 
254 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)255 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
256   const int flt_count =
257       GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
258   DataType weights_type = conv_params_.weights_data_type;
259 
260   std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_type));
261   RearrangeWeights(weights, GetWeightsDescription(), weights_type,
262                    absl::MakeSpan(weights_data));
263 
264   if (conv_params_.AreWeightsBuffer()) {
265     BufferDescriptor desc;
266     desc.element_type = weights_type;
267     desc.element_size = 4;
268     desc.memory_type = conv_params_.weights_upload_type ==
269                                ConvPowerVR::WeightsUploadType::CONSTANT_MEM
270                            ? MemoryType::CONSTANT
271                            : MemoryType::GLOBAL;
272     desc.size = weights_data.size();
273     desc.data = std::move(weights_data);
274     args_.AddObject("weights",
275                     absl::make_unique<BufferDescriptor>(std::move(desc)));
276   } else {
277     const int dst_depth =
278         AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
279     const int src_depth = DivideRoundUp(weights.shape.i, 4);
280     const int kernel_x = weights.shape.w;
281     const int kernel_y = weights.shape.h;
282     int texture_width = dst_depth;
283     int texture_height = src_depth * kernel_x * kernel_y;
284     int sub_size = SizeOf(weights_type) * 4 * texture_width * texture_height;
285     for (int i = 0; i < 4; ++i) {
286       Texture2DDescriptor desc;
287       desc.element_type = weights_type;
288       desc.size = int2(texture_width, texture_height);
289       desc.data.resize(sub_size);
290       memcpy(desc.data.data(), weights_data.data() + sub_size * i, sub_size);
291       const std::string name = "weights" + std::to_string(i);
292       args_.AddObject(name,
293                       absl::make_unique<Texture2DDescriptor>(std::move(desc)));
294     }
295   }
296 }
297 
298 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWDI,T> & weights)299 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
300   const int flt_count =
301       GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
302   DataType weights_type = conv_params_.weights_data_type;
303 
304   std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_type));
305   RearrangeWeights(weights, GetWeightsDescription(), weights_type,
306                    absl::MakeSpan(weights_data));
307 
308   if (conv_params_.AreWeightsBuffer()) {
309     BufferDescriptor desc;
310     desc.element_type = weights_type;
311     desc.element_size = 4;
312     desc.size = weights_data.size();
313     desc.data = std::move(weights_data);
314     args_.AddObject("weights",
315                     absl::make_unique<BufferDescriptor>(std::move(desc)));
316   } else {
317     const int dst_slices =
318         AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
319     const int src_slices = DivideRoundUp(weights.shape.i, 4);
320     const int kernel_x = weights.shape.w;
321     const int kernel_y = weights.shape.h;
322     const int kernel_z = weights.shape.d;
323     int texture_width = dst_slices;
324     int texture_height = src_slices * kernel_x * kernel_y * kernel_z;
325     int sub_size = SizeOf(weights_type) * 4 * texture_width * texture_height;
326     for (int i = 0; i < 4; ++i) {
327       Texture2DDescriptor desc;
328       desc.element_type = weights_type;
329       desc.size = int2(texture_width, texture_height);
330       desc.data.resize(sub_size);
331       memcpy(desc.data.data(), weights_data.data() + sub_size * i, sub_size);
332       const std::string name = "weights" + std::to_string(i);
333       args_.AddObject(name,
334                       absl::make_unique<Texture2DDescriptor>(std::move(desc)));
335     }
336   }
337 }
338 
339 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
340                               const OperationDef& definition,
341                               const Convolution2DAttributes& attr,
342                               const BHWC* dst_shape = nullptr);
343 
344 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
345                               const OperationDef& definition,
346                               const FullyConnectedAttributes& attr,
347                               const BHWC* dst_shape = nullptr);
348 
349 ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
350                                             const OperationDef& definition,
351                                             const Convolution2DAttributes& attr,
352                                             const BHWC& weights_shape,
353                                             const BHWC* dst_shape = nullptr);
354 
355 ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
356                                           const OperationDef& definition,
357                                           const Convolution2DAttributes& attr,
358                                           const BHWC* dst_shape = nullptr);
359 
360 ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
361                                 const OperationDef& definition,
362                                 const Convolution3DAttributes& attr,
363                                 const BHWDC* dst_shape = nullptr);
364 
365 }  // namespace gpu
366 }  // namespace tflite
367 
368 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
369