1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
18
19 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
20 #include "tensorflow/lite/delegates/gpu/common/operations.h"
21 #include "tensorflow/lite/delegates/gpu/common/shape.h"
22 #include "tensorflow/lite/delegates/gpu/common/status.h"
23 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
25 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
28 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
29 #include "tensorflow/lite/delegates/gpu/common/types.h"
30 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
31
32 namespace tflite {
33 namespace gpu {
34
35 class ConvBuffer1x1 : public GPUOperation {
36 public:
37 ConvBuffer1x1() = default;
38
39 // Move only
40 ConvBuffer1x1(ConvBuffer1x1&& operation);
41 ConvBuffer1x1& operator=(ConvBuffer1x1&& operation);
42 ConvBuffer1x1(const ConvBuffer1x1&) = delete;
43 ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
44
45 void GetPossibleKernelWorkGroups(
46 TuningType tuning_type, const GpuInfo& gpu_info,
47 const KernelInfo& kernel_info,
48 std::vector<int3>* work_groups) const override;
49 int3 GetGridSize() const override;
50
GetWeightsDescription()51 WeightsDescription GetWeightsDescription() const {
52 WeightsDescription desc;
53 desc.layout = WeightsLayout::kOSpatialIOGroupI4O4;
54 desc.output_group_size = conv_params_.block_size.z;
55 return desc;
56 }
57
58 struct ConvParams {
59 int3 block_size = int3(1, 1, 1);
60 int element_size = 4; // can be 4, 8 or 16
61
62 // By default in 2d convolution we have the same weights for WH dims, but in
63 // some cases we need separate weights for H dimension and convolution
64 // kernel requires very small modifications to support it.
65 bool different_weights_for_height = false;
66 };
67
68 private:
69 ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params,
70 const GpuInfo& gpu_info);
71 friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
72 const OperationDef& definition,
73 const Convolution2DAttributes& attr,
74 const BHWC* shape);
75 friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
76 const OperationDef& definition,
77 const FullyConnectedAttributes& attr,
78 const BHWC* shape);
79 friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
80 const GpuInfo& gpu_info, const OperationDef& definition,
81 const Convolution2DAttributes& attr, const BHWC* shape);
82 friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
83 const GpuInfo& gpu_info, const OperationDef& definition,
84 const Convolution2DAttributes& attr, const BHWC& weights_shape,
85 const BHWC* dst_shape);
86
87 template <DataType T>
88 void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
89 const tflite::gpu::Tensor<Linear, T>& biases);
90 template <DataType T>
91 void UploadDataForWinograd4x4To6x6(
92 const tflite::gpu::Tensor<OHWI, T>& weights);
93
94 template <DataType T>
95 void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
96
97 template <DataType T>
98 void UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases);
99
100 std::string GenerateConvBuffer1x1(
101 const OperationDef& op_def, const ConvBuffer1x1::ConvParams& conv_params,
102 const GpuInfo& gpu_info, Arguments* args);
103
104 ConvParams conv_params_;
105 };
106
107 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)108 void ConvBuffer1x1::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
109 const tflite::gpu::Tensor<Linear, T>& biases) {
110 UploadWeights(weights);
111 UploadBiases(biases);
112 }
113
114 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)115 void ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
116 const tflite::gpu::Tensor<OHWI, T>& weights) {
117 tflite::gpu::Tensor<OHWI, T> wino_weights;
118 RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
119 UploadWeights(wino_weights);
120 tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
121 bias.shape = Linear(weights.shape.o);
122 bias.data.resize(weights.shape.o, 0.0f);
123 UploadBiases(bias);
124 }
125
126 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)127 void ConvBuffer1x1::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
128 const int dst_depth = DivideRoundUp(weights.shape.o, 4);
129 const int src_depth = DivideRoundUp(weights.shape.i, 4);
130
131 const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
132 const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
133
134 const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
135 const int elements_count =
136 weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
137
138 BufferDescriptor desc;
139 desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
140 desc.element_size = 16;
141 desc.memory_type = MemoryType::GLOBAL;
142 desc.size = float4_size * elements_count;
143 desc.data.resize(desc.size);
144
145 if (f32_weights) {
146 float4* ptr = reinterpret_cast<float4*>(desc.data.data());
147 RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
148 absl::MakeSpan(ptr, elements_count));
149 } else {
150 half4* ptr = reinterpret_cast<half4*>(desc.data.data());
151 RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
152 absl::MakeSpan(ptr, elements_count));
153 }
154
155 args_.AddObject("weights",
156 absl::make_unique<BufferDescriptor>(std::move(desc)));
157 }
158
159 template <DataType T>
UploadBiases(const tflite::gpu::Tensor<Linear,T> & biases)160 void ConvBuffer1x1::UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases) {
161 TensorLinearDescriptor desc;
162 desc.storage_type = LinearStorageType::BUFFER;
163 desc.element_type = definition_.GetDataType();
164 int depth = AlignByN(biases.shape.v, 4 * conv_params_.block_size.z) / 4;
165 desc.UploadLinearData(biases, depth);
166 args_.AddObject("biases",
167 absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
168 }
169
// Presumably reports whether ConvBuffer1x1 can handle the given operation
// definition and convolution attributes; the exact criteria live in the
// out-of-line definition, which is not part of this header.
bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const Convolution2DAttributes& attr);

// Overload that additionally takes an explicit weights shape.
// NOTE(review): looks like the counterpart of
// CreateConvBuffer1x1DynamicWeights -- confirm against the definition.
bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const BHWC& weights_shape,
                              const Convolution2DAttributes& attr);

// Factory functions for ConvBuffer1x1. They are declared as friends of the
// class because its parameterized constructor is private. In each of them the
// trailing shape pointer is optional and defaults to nullptr.

// Creates a ConvBuffer1x1 from 2D convolution attributes.
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
                                  const OperationDef& definition,
                                  const Convolution2DAttributes& attr,
                                  const BHWC* shape = nullptr);

// Creates a ConvBuffer1x1 from fully connected attributes.
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
                                  const OperationDef& definition,
                                  const FullyConnectedAttributes& attr,
                                  const BHWC* shape = nullptr);

// Creates a ConvBuffer1x1 given an explicit weights shape.
// NOTE(review): the name suggests the weights are supplied at runtime rather
// than taken from `attr` -- confirm against the definition.
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC* dst_shape = nullptr);

// Creates a ConvBuffer1x1 for the Winograd 4x4->6x6 case.
// NOTE(review): presumably uploads its data through
// ConvBuffer1x1::UploadDataForWinograd4x4To6x6 -- confirm.
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC* shape = nullptr);
195
196 } // namespace gpu
197 } // namespace tflite
198
199 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
200