1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
18
19 #include <cstring>
20 #include <vector>
21
22 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/shape.h"
25 #include "tensorflow/lite/delegates/gpu/common/status.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
33 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
34 #include "tensorflow/lite/delegates/gpu/common/types.h"
35 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
36
37 namespace tflite {
38 namespace gpu {
39
40 class ConvPowerVR : public GPUOperation {
41 public:
42 ConvPowerVR() = default;
43 void GetPossibleKernelWorkGroups(
44 TuningType tuning_type, const GpuInfo& gpu_info,
45 const KernelInfo& kernel_info,
46 std::vector<int3>* work_groups) const override;
47 absl::Status BindArguments(ArgumentsBinder* args) override;
48 int3 GetGridSize() const override;
49
GetWeightsDescription()50 WeightsDescription GetWeightsDescription() const {
51 WeightsDescription desc;
52 desc.layout = conv_params_.weights_layout;
53 desc.output_group_size = conv_params_.block_size.w;
54 return desc;
55 }
56
57 // Move only
58 ConvPowerVR(ConvPowerVR&& operation);
59 ConvPowerVR& operator=(ConvPowerVR&& operation);
60 ConvPowerVR(const ConvPowerVR&) = delete;
61 ConvPowerVR& operator=(const ConvPowerVR&) = delete;
62
63 private:
64 enum class WeightsUploadType {
65 LOCAL_MEM_ASYNC_SUBGROUP, // we use it for PowerVR with workgroup size = 32
66 LOCAL_MEM_BY_THREADS,
67 GLOBAL_MEM,
68 CONSTANT_MEM,
69 PRIVATE_MEM_SIMD_BROADCAST,
70 TEXTURES_MEM_X4, // 4 textures for weights
71 };
72
73 struct ConvParams {
74 // Usually we use this combinations for CalculationPrecision:
75 // F32: all F32
76 // F16: all F16
77 // F32_F16: all besides accumulator is F16, including weights
78 // But for PowerVR we can achieve better performance in F32_F16 with F32
79 // weights, so for PowerVR in this kernel we have F32 weights for
80 // F32_F16 precision mode
81 DataType weights_data_type; // used for weights and biases
82 int4 block_size; // WHDS
83 bool fixed_work_group_size;
84 bool linear_spatial; // spatial dimensions are Width/Height/Depth
85 bool linear_all; // linear_spatial & linear_all can not be used together,
86 // linear_all can not be used with WeightsUploadTypes
87 // that use workgroups(subgroups) for
88 // uploading(LOCAL_MEM_BY_THREADS for example).
89 bool different_weights_for_height;
90 int src_depth_loop_size;
91 WeightsUploadType weights_upload_type;
92 bool x_kernel_is_1;
93 bool y_kernel_is_1;
94 bool z_kernel_is_1;
95 WeightsLayout weights_layout;
96
97 // used only with PRIVATE_MEM_SIMD_BROADCAST
98 int simd_size = 1;
99
AreWeightsBufferConvParams100 bool AreWeightsBuffer() const {
101 return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4;
102 }
103
IsPrivateMemBroadcastConvParams104 bool IsPrivateMemBroadcast() const {
105 return weights_upload_type ==
106 WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
107 }
108 };
109
110 ConvPowerVR(const OperationDef& definition,
111 const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
112 const BHWC* dst_shape = nullptr);
113 ConvPowerVR(const OperationDef& definition,
114 const Convolution2DAttributes& attr, const BHWC& weights_shape,
115 const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
116 ConvPowerVR(const OperationDef& definition,
117 const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
118 const BHWC* dst_shape = nullptr);
119 explicit ConvPowerVR(const OperationDef& definition);
120 ConvPowerVR(const OperationDef& definition,
121 const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
122 const BHWDC* dst_shape = nullptr);
123
124 void GenerateCode(const GpuInfo& gpu_info);
125
126 template <DataType T>
127 void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
128 const tflite::gpu::Tensor<Linear, T>& biases);
129 template <DataType T>
130 void UploadDataForWinograd4x4To6x6(
131 const tflite::gpu::Tensor<OHWI, T>& weights);
132
133 template <DataType T>
134 void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
135
136 template <DataType T>
137 void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
138
139 template <DataType T>
140 void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
141
142 friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
143 const OperationDef& definition,
144 const Convolution2DAttributes& attr,
145 const BHWC* dst_shape);
146
147 friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
148 const OperationDef& definition,
149 const FullyConnectedAttributes& attr,
150 const BHWC* dst_shape);
151
152 friend ConvPowerVR CreateConvPowerVRDynamicWeights(
153 const GpuInfo& gpu_info, const OperationDef& definition,
154 const Convolution2DAttributes& attr, const BHWC& weights_shape,
155 const BHWC* dst_shape);
156
157 friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
158 const GpuInfo& gpu_info, const OperationDef& definition,
159 const Convolution2DAttributes& attr, const BHWC* dst_shape);
160
161 friend ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
162 const OperationDef& definition,
163 const Convolution3DAttributes& attr,
164 const BHWDC* dst_shape);
165
166 ConvParams GuessBestParams(const GpuInfo& gpu_info,
167 const OperationDef& definition,
168 const Convolution2DAttributes& attr,
169 const BHWC* dst_shape = nullptr);
170 ConvParams GuessBestParams(const GpuInfo& gpu_info,
171 const OperationDef& definition,
172 const Convolution2DAttributes& attr,
173 const BHWC& weights_shape,
174 const BHWC* dst_shape = nullptr);
175 ConvParams GuessBestParams(const GpuInfo& gpu_info,
176 const OperationDef& definition,
177 const FullyConnectedAttributes& attr,
178 const BHWC* dst_shape = nullptr);
179 ConvParams GuessBestParamsWinograd(const GpuInfo& gpu_info,
180 const OperationDef& definition,
181 const Convolution2DAttributes& attr,
182 const BHWC* dst_shape = nullptr);
183 ConvParams GuessBestParams(const GpuInfo& gpu_info,
184 const OperationDef& definition,
185 const Convolution3DAttributes& attr,
186 const BHWDC* dst_shape = nullptr);
187 ConvParams GuessBestParams(const GpuInfo& gpu_info,
188 const OperationDef& definition, int src_depth,
189 int dst_depth, bool x_kernel_is_1,
190 bool y_kernel_is_1,
191 bool different_weights_for_height,
192 const BHWC* dst_shape = nullptr);
193
194 std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
195 bool stride_correction,
196 const ConvParams& conv_params);
197
198 int4 stride_;
199 int4 padding_;
200 int4 kernel_size_;
201 int4 dilation_;
202 ConvParams conv_params_;
203 };
204
205 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)206 void ConvPowerVR::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
207 const tflite::gpu::Tensor<Linear, T>& biases) {
208 UploadWeights(weights);
209 UploadBias(biases);
210 }
211
212 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)213 void ConvPowerVR::UploadDataForWinograd4x4To6x6(
214 const tflite::gpu::Tensor<OHWI, T>& weights) {
215 tflite::gpu::Tensor<OHWI, T> wino_weights;
216 RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
217 UploadWeights(wino_weights);
218 tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
219 biases.shape = Linear(weights.shape.o);
220 biases.data.resize(weights.shape.o, 0.0f);
221 UploadBias(biases);
222 }
223
224 template <DataType T>
UploadBias(const tflite::gpu::Tensor<Linear,T> & bias)225 void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
226 BufferDescriptor desc;
227 desc.element_type = conv_params_.weights_data_type;
228 desc.element_size = 4;
229 desc.memory_type = conv_params_.weights_upload_type ==
230 ConvPowerVR::WeightsUploadType::CONSTANT_MEM
231 ? MemoryType::CONSTANT
232 : MemoryType::GLOBAL;
233 const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
234 ? sizeof(float)
235 : sizeof(half);
236 int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
237 desc.size = float_size * aligned_channels;
238 desc.data.resize(desc.size);
239 if (conv_params_.weights_data_type == DataType::FLOAT32) {
240 float* gpu_data = reinterpret_cast<float*>(desc.data.data());
241 for (int i = 0; i < aligned_channels; ++i) {
242 gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
243 }
244 } else {
245 half* gpu_data = reinterpret_cast<half*>(desc.data.data());
246 for (int i = 0; i < aligned_channels; ++i) {
247 gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
248 }
249 }
250 args_.AddObject("biases",
251 absl::make_unique<BufferDescriptor>(std::move(desc)));
252 }
253
254 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)255 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
256 const int flt_count =
257 GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
258 DataType weights_type = conv_params_.weights_data_type;
259
260 std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_type));
261 RearrangeWeights(weights, GetWeightsDescription(), weights_type,
262 absl::MakeSpan(weights_data));
263
264 if (conv_params_.AreWeightsBuffer()) {
265 BufferDescriptor desc;
266 desc.element_type = weights_type;
267 desc.element_size = 4;
268 desc.memory_type = conv_params_.weights_upload_type ==
269 ConvPowerVR::WeightsUploadType::CONSTANT_MEM
270 ? MemoryType::CONSTANT
271 : MemoryType::GLOBAL;
272 desc.size = weights_data.size();
273 desc.data = std::move(weights_data);
274 args_.AddObject("weights",
275 absl::make_unique<BufferDescriptor>(std::move(desc)));
276 } else {
277 const int dst_depth =
278 AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
279 const int src_depth = DivideRoundUp(weights.shape.i, 4);
280 const int kernel_x = weights.shape.w;
281 const int kernel_y = weights.shape.h;
282 int texture_width = dst_depth;
283 int texture_height = src_depth * kernel_x * kernel_y;
284 int sub_size = SizeOf(weights_type) * 4 * texture_width * texture_height;
285 for (int i = 0; i < 4; ++i) {
286 Texture2DDescriptor desc;
287 desc.element_type = weights_type;
288 desc.size = int2(texture_width, texture_height);
289 desc.data.resize(sub_size);
290 memcpy(desc.data.data(), weights_data.data() + sub_size * i, sub_size);
291 const std::string name = "weights" + std::to_string(i);
292 args_.AddObject(name,
293 absl::make_unique<Texture2DDescriptor>(std::move(desc)));
294 }
295 }
296 }
297
298 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWDI,T> & weights)299 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
300 const int block_size = conv_params_.block_size.w;
301 const int dst_slices =
302 AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
303 const int src_slices = DivideRoundUp(weights.shape.i, 4);
304
305 const int elements_count = weights.shape.d * weights.shape.h *
306 weights.shape.w * src_slices * dst_slices * 4;
307 const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
308
309 const int float4_size = f32_weights ? 16 : 8;
310
311 std::vector<uint8_t> data(float4_size * elements_count);
312
313 if (f32_weights) {
314 float4* ptr = reinterpret_cast<float4*>(data.data());
315 if (conv_params_.AreWeightsBuffer()) {
316 RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w,
317 absl::MakeSpan(ptr, elements_count));
318 } else {
319 RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w,
320 absl::MakeSpan(ptr, elements_count));
321 }
322 } else {
323 half4* ptr = reinterpret_cast<half4*>(data.data());
324 if (conv_params_.AreWeightsBuffer()) {
325 RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w,
326 absl::MakeSpan(ptr, elements_count));
327 } else {
328 RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w,
329 absl::MakeSpan(ptr, elements_count));
330 }
331 }
332
333 if (conv_params_.AreWeightsBuffer()) {
334 BufferDescriptor desc;
335 desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
336 desc.element_size = 4;
337 desc.size = float4_size * elements_count;
338 desc.data = std::move(data);
339 args_.AddObject("weights",
340 absl::make_unique<BufferDescriptor>(std::move(desc)));
341 } else {
342 const int texture_width = dst_slices;
343 const int texture_height =
344 src_slices * weights.shape.d * weights.shape.h * weights.shape.w;
345 int sub_size = float4_size * texture_width * texture_height;
346 for (int i = 0; i < 4; ++i) {
347 Texture2DDescriptor desc;
348 desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
349 desc.size = int2(texture_width, texture_height);
350 desc.data.resize(sub_size);
351 memcpy(desc.data.data(), data.data() + sub_size * i, sub_size);
352 const std::string name = "weights" + std::to_string(i);
353 args_.AddObject(name,
354 absl::make_unique<Texture2DDescriptor>(std::move(desc)));
355 }
356 }
357 }
358
359 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
360 const OperationDef& definition,
361 const Convolution2DAttributes& attr,
362 const BHWC* dst_shape = nullptr);
363
364 ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
365 const OperationDef& definition,
366 const FullyConnectedAttributes& attr,
367 const BHWC* dst_shape = nullptr);
368
369 ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
370 const OperationDef& definition,
371 const Convolution2DAttributes& attr,
372 const BHWC& weights_shape,
373 const BHWC* dst_shape = nullptr);
374
375 ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
376 const OperationDef& definition,
377 const Convolution2DAttributes& attr,
378 const BHWC* dst_shape = nullptr);
379
380 ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
381 const OperationDef& definition,
382 const Convolution3DAttributes& attr,
383 const BHWDC* dst_shape = nullptr);
384
385 } // namespace gpu
386 } // namespace tflite
387
388 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_POWERVR_H_
389