1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
18
19 #include <cstring>
20 #include <vector>
21
22 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
33 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
34 #include "tensorflow/lite/delegates/gpu/common/types.h"
35 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
36
37 namespace tflite {
38 namespace gpu {
39
40 class ConvPowerVR : public GPUOperation {
41 public:
42 ConvPowerVR() = default;
43 void GetPossibleKernelWorkGroups(
44 TuningType tuning_type, const GpuInfo& gpu_info,
45 const KernelInfo& kernel_info,
46 std::vector<int3>* work_groups) const override;
47 absl::Status BindArguments(ArgumentsBinder* args) override;
48 int3 GetGridSize() const override;
49
GetWeightsDescription()50 WeightsDescription GetWeightsDescription() const {
51 WeightsDescription desc;
52 desc.layout = conv_params_.weights_layout;
53 desc.output_group_size = conv_params_.block_size.w;
54 return desc;
55 }
56
57 // Move only
58 ConvPowerVR(ConvPowerVR&& operation);
59 ConvPowerVR& operator=(ConvPowerVR&& operation);
60 ConvPowerVR(const ConvPowerVR&) = delete;
61 ConvPowerVR& operator=(const ConvPowerVR&) = delete;
62
63 private:
64 enum class WeightsUploadType {
65 LOCAL_MEM_ASYNC_SUBGROUP, // we use it for PowerVR with workgroup size = 32
66 LOCAL_MEM_BY_THREADS,
67 GLOBAL_MEM,
68 CONSTANT_MEM,
69 PRIVATE_MEM_SIMD_BROADCAST,
70 TEXTURES_MEM_X4, // 4 textures for weights
71 };
72
73 struct ConvParams {
74 // Usually we use this combinations for CalculationPrecision:
75 // F32: all F32
76 // F16: all F16
77 // F32_F16: all besides accumulator is F16, including weights
78 // But for PowerVR we can achieve better performance in F32_F16 with F32
79 // weights, so for PowerVR in this kernel we have F32 weights for
80 // F32_F16 precision mode
81 DataType weights_data_type; // used for weights and biases
82 int4 block_size; // WHDS
83 bool fixed_work_group_size;
84 bool linear_spatial; // spatial dimensions are Width/Height/Depth
85 bool linear_all; // linear_spatial & linear_all can not be used together,
86 // linear_all can not be used with WeightsUploadTypes
87 // that use workgroups(subgroups) for
88 // uploading(LOCAL_MEM_BY_THREADS for example).
89 bool different_weights_for_height;
90 int src_depth_loop_size;
91 WeightsUploadType weights_upload_type;
92 bool x_kernel_is_1;
93 bool y_kernel_is_1;
94 bool z_kernel_is_1;
95 WeightsLayout weights_layout;
96
97 // used only with PRIVATE_MEM_SIMD_BROADCAST
98 int simd_size = 1;
99
AreWeightsBufferConvParams100 bool AreWeightsBuffer() const {
101 return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4;
102 }
103
IsPrivateMemBroadcastConvParams104 bool IsPrivateMemBroadcast() const {
105 return weights_upload_type ==
106 WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
107 }
108 };
109
110 ConvPowerVR(const OperationDef& definition,
111 const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
112 const BHWC* dst_shape = nullptr);
113 ConvPowerVR(const OperationDef& definition,
114 const Convolution2DAttributes& attr, const BHWC& weights_shape,
115 const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
116 ConvPowerVR(const OperationDef& definition,
117 const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
118 const BHWC* dst_shape = nullptr);
119 explicit ConvPowerVR(const OperationDef& definition);
120 ConvPowerVR(const OperationDef& definition,
121 const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
122 const BHWDC* dst_shape = nullptr);
123
124 void GenerateCode(const GpuInfo& gpu_info);
125
126 template <DataType T>
127 void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
128 const tflite::gpu::Tensor<Linear, T>& biases);
129 template <DataType T>
130 void UploadDataForWinograd4x4To6x6(
131 const tflite::gpu::Tensor<OHWI, T>& weights);
132
133 template <DataType T>
134 void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
135
136 template <DataType T>
137 void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
138
139 template <DataType T>
140 void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
141
142 friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
143 const OperationDef& definition,
144 const Convolution2DAttributes& attr,
145 const BHWC* dst_shape);
146
147 friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
148 const OperationDef& definition,
149 const FullyConnectedAttributes& attr,
150 const BHWC* dst_shape);
151
152 friend ConvPowerVR CreateConvPowerVRDynamicWeights(
153 const GpuInfo& gpu_info, const OperationDef& definition,
154 const Convolution2DAttributes& attr, const BHWC& weights_shape,
155 const BHWC* dst_shape);
156
157 friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
158 const GpuInfo& gpu_info, const OperationDef& definition,
159 const Convolution2DAttributes& attr, const BHWC* dst_shape);
160
161 friend ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
162 const OperationDef& definition,
163 const Convolution3DAttributes& attr,
164 const BHWDC* dst_shape);
165
166 ConvParams GuessBestParams(const GpuInfo& gpu_info,
167 const OperationDef& definition,
168 const Convolution2DAttributes& attr,
169 const BHWC* dst_shape = nullptr);
170 ConvParams GuessBestParams(const GpuInfo& gpu_info,
171 const OperationDef& definition,
172 const Convolution2DAttributes& attr,
173 const BHWC& weights_shape,
174 const BHWC* dst_shape = nullptr);
175 ConvParams GuessBestParams(const GpuInfo& gpu_info,
176 const OperationDef& definition,
177 const FullyConnectedAttributes& attr,
178 const BHWC* dst_shape = nullptr);
179 ConvParams GuessBestParamsWinograd(const GpuInfo& gpu_info,
180 const OperationDef& definition,
181 const Convolution2DAttributes& attr,
182 const BHWC* dst_shape = nullptr);
183 ConvParams GuessBestParams(const GpuInfo& gpu_info,
184 const OperationDef& definition,
185 const Convolution3DAttributes& attr,
186 const BHWDC* dst_shape = nullptr);
187 ConvParams GuessBestParams(const GpuInfo& gpu_info,
188 const OperationDef& definition, int src_depth,
189 int dst_depth, bool x_kernel_is_1,
190 bool y_kernel_is_1,
191 bool different_weights_for_height,
192 const BHWC* dst_shape = nullptr);
193
194 std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
195 bool stride_correction,
196 const ConvParams& conv_params);
197
198 int4 stride_;
199 int4 padding_;
200 int4 kernel_size_;
201 int4 dilation_;
202 ConvParams conv_params_;
203 };
204
205 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)206 void ConvPowerVR::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
207 const tflite::gpu::Tensor<Linear, T>& biases) {
208 UploadWeights(weights);
209 UploadBias(biases);
210 }
211
212 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)213 void ConvPowerVR::UploadDataForWinograd4x4To6x6(
214 const tflite::gpu::Tensor<OHWI, T>& weights) {
215 tflite::gpu::Tensor<OHWI, T> wino_weights;
216 RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
217 UploadWeights(wino_weights);
218 tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
219 biases.shape = Linear(weights.shape.o);
220 biases.data.resize(weights.shape.o, 0.0f);
221 UploadBias(biases);
222 }
223
224 template <DataType T>
UploadBias(const tflite::gpu::Tensor<Linear,T> & bias)225 void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
226 BufferDescriptor desc;
227 desc.element_type = conv_params_.weights_data_type;
228 desc.element_size = 4;
229 desc.memory_type = conv_params_.weights_upload_type ==
230 ConvPowerVR::WeightsUploadType::CONSTANT_MEM
231 ? MemoryType::CONSTANT
232 : MemoryType::GLOBAL;
233 const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
234 ? sizeof(float)
235 : sizeof(half);
236 int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
237 desc.size = float_size * aligned_channels;
238 desc.data.resize(desc.size);
239 if (conv_params_.weights_data_type == DataType::FLOAT32) {
240 float* gpu_data = reinterpret_cast<float*>(desc.data.data());
241 for (int i = 0; i < aligned_channels; ++i) {
242 gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
243 }
244 } else {
245 half* gpu_data = reinterpret_cast<half*>(desc.data.data());
246 for (int i = 0; i < aligned_channels; ++i) {
247 gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
248 }
249 }
250 args_.AddObject("biases",
251 absl::make_unique<BufferDescriptor>(std::move(desc)));
252 }
253
254 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)255 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
256 const int flt_count =
257 GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
258 DataType weights_type = conv_params_.weights_data_type;
259
260 std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_type));
261 RearrangeWeights(weights, GetWeightsDescription(), weights_type,
262 absl::MakeSpan(weights_data));
263
264 if (conv_params_.AreWeightsBuffer()) {
265 BufferDescriptor desc;
266 desc.element_type = weights_type;
267 desc.element_size = 4;
268 desc.memory_type = conv_params_.weights_upload_type ==
269 ConvPowerVR::WeightsUploadType::CONSTANT_MEM
270 ? MemoryType::CONSTANT
271 : MemoryType::GLOBAL;
272 desc.size = weights_data.size();
273 desc.data = std::move(weights_data);
274 args_.AddObject("weights",
275 absl::make_unique<BufferDescriptor>(std::move(desc)));
276 } else {
277 const int dst_depth =
278 AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
279 const int src_depth = DivideRoundUp(weights.shape.i, 4);
280 const int kernel_x = weights.shape.w;
281 const int kernel_y = weights.shape.h;
282 int texture_width = dst_depth;
283 int texture_height = src_depth * kernel_x * kernel_y;
284 int sub_size = SizeOf(weights_type) * 4 * texture_width * texture_height;
285 for (int i = 0; i < 4; ++i) {
286 Texture2DDescriptor desc;
287 desc.element_type = weights_type;
288 desc.size = int2(texture_width, texture_height);
289 desc.data.resize(sub_size);
290 memcpy(desc.data.data(), weights_data.data() + sub_size * i, sub_size);
291 const std::string name = "weights" + std::to_string(i);
292 args_.AddObject(name,
293 absl::make_unique<Texture2DDescriptor>(std::move(desc)));
294 }
295 }
296 }
297
298 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWDI,T> & weights)299 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
300 const int flt_count =
301 GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
302 DataType weights_type = conv_params_.weights_data_type;
303
304 std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_type));
305 RearrangeWeights(weights, GetWeightsDescription(), weights_type,
306 absl::MakeSpan(weights_data));
307
308 if (conv_params_.AreWeightsBuffer()) {
309 BufferDescriptor desc;
310 desc.element_type = weights_type;
311 desc.element_size = 4;
312 desc.size = weights_data.size();
313 desc.data = std::move(weights_data);
314 args_.AddObject("weights",
315 absl::make_unique<BufferDescriptor>(std::move(desc)));
316 } else {
317 const int dst_slices =
318 AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
319 const int src_slices = DivideRoundUp(weights.shape.i, 4);
320 const int kernel_x = weights.shape.w;
321 const int kernel_y = weights.shape.h;
322 const int kernel_z = weights.shape.d;
323 int texture_width = dst_slices;
324 int texture_height = src_slices * kernel_x * kernel_y * kernel_z;
325 int sub_size = SizeOf(weights_type) * 4 * texture_width * texture_height;
326 for (int i = 0; i < 4; ++i) {
327 Texture2DDescriptor desc;
328 desc.element_type = weights_type;
329 desc.size = int2(texture_width, texture_height);
330 desc.data.resize(sub_size);
331 memcpy(desc.data.data(), weights_data.data() + sub_size * i, sub_size);
332 const std::string name = "weights" + std::to_string(i);
333 args_.AddObject(name,
334 absl::make_unique<Texture2DDescriptor>(std::move(desc)));
335 }
336 }
337 }
338
339 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
340 const OperationDef& definition,
341 const Convolution2DAttributes& attr,
342 const BHWC* dst_shape = nullptr);
343
344 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
345 const OperationDef& definition,
346 const FullyConnectedAttributes& attr,
347 const BHWC* dst_shape = nullptr);
348
349 ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
350 const OperationDef& definition,
351 const Convolution2DAttributes& attr,
352 const BHWC& weights_shape,
353 const BHWC* dst_shape = nullptr);
354
355 ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
356 const OperationDef& definition,
357 const Convolution2DAttributes& attr,
358 const BHWC* dst_shape = nullptr);
359
360 ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
361 const OperationDef& definition,
362 const Convolution3DAttributes& attr,
363 const BHWDC* dst_shape = nullptr);
364
365 } // namespace gpu
366 } // namespace tflite
367
368 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
369