1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
18
19 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
20 #include "tensorflow/lite/delegates/gpu/common/operations.h"
21 #include "tensorflow/lite/delegates/gpu/common/shape.h"
22 #include "tensorflow/lite/delegates/gpu/common/status.h"
23 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
25 #include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
27 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
28 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
29 #include "tensorflow/lite/delegates/gpu/common/types.h"
30 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
31
32 namespace tflite {
33 namespace gpu {
34
35 class ConvBuffer1x1 : public GPUOperation {
36 public:
37 ConvBuffer1x1() = default;
38
39 // Move only
40 ConvBuffer1x1(ConvBuffer1x1&& operation);
41 ConvBuffer1x1& operator=(ConvBuffer1x1&& operation);
42 ConvBuffer1x1(const ConvBuffer1x1&) = delete;
43 ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
44
45 void GetPossibleKernelWorkGroups(
46 TuningType tuning_type, const GpuInfo& gpu_info,
47 const KernelInfo& kernel_info,
48 std::vector<int3>* work_groups) const override;
49 int3 GetGridSize() const override;
50
GetWeightsDescription()51 WeightsDescription GetWeightsDescription() const {
52 WeightsDescription desc;
53 desc.layout = WeightsLayout::kOHWIOGroupI4O4;
54 desc.output_group_size = conv_params_.block_size.z;
55 return desc;
56 }
57
58 struct ConvParams {
59 int3 block_size = int3(1, 1, 1);
60 int element_size = 4; // can be 4, 8 or 16
61
62 // By default in 2d convolution we have the same weights for WH dims, but in
63 // some cases we need separate weights for H dimension and convolution
64 // kernel requires very small modifications to support it.
65 bool different_weights_for_height = false;
66 };
67
68 private:
69 ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
70 friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
71 const OperationDef& definition,
72 const Convolution2DAttributes& attr,
73 const BHWC* shape);
74 friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
75 const OperationDef& definition,
76 const FullyConnectedAttributes& attr,
77 const BHWC* shape);
78 friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
79 const GpuInfo& gpu_info, const OperationDef& definition,
80 const Convolution2DAttributes& attr, const BHWC* shape);
81 friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
82 const GpuInfo& gpu_info, const OperationDef& definition,
83 const Convolution2DAttributes& attr, const BHWC& weights_shape,
84 const BHWC* dst_shape);
85
86 template <DataType T>
87 void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
88 const tflite::gpu::Tensor<Linear, T>& biases);
89 template <DataType T>
90 void UploadDataForWinograd4x4To6x6(
91 const tflite::gpu::Tensor<OHWI, T>& weights);
92
93 template <DataType T>
94 void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
95
96 template <DataType T>
97 void UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases);
98
99 std::string GenerateConvBuffer1x1(
100 const OperationDef& op_def, const ConvBuffer1x1::ConvParams& conv_params,
101 Arguments* args);
102
103 ConvParams conv_params_;
104 };
105
106 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)107 void ConvBuffer1x1::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
108 const tflite::gpu::Tensor<Linear, T>& biases) {
109 UploadWeights(weights);
110 UploadBiases(biases);
111 }
112
113 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)114 void ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
115 const tflite::gpu::Tensor<OHWI, T>& weights) {
116 tflite::gpu::Tensor<OHWI, T> wino_weights;
117 RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
118 UploadWeights(wino_weights);
119 tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
120 bias.shape = Linear(weights.shape.o);
121 bias.data.resize(weights.shape.o, 0.0f);
122 UploadBiases(bias);
123 }
124
125 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)126 void ConvBuffer1x1::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
127 const int dst_depth = DivideRoundUp(weights.shape.o, 4);
128 const int src_depth = DivideRoundUp(weights.shape.i, 4);
129
130 const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
131 const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
132
133 const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
134 const int elements_count =
135 weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
136
137 BufferDescriptor desc;
138 desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
139 desc.element_size = 16;
140 desc.memory_type = MemoryType::GLOBAL;
141 desc.size = float4_size * elements_count;
142 desc.data.resize(desc.size);
143
144 if (f32_weights) {
145 float4* ptr = reinterpret_cast<float4*>(desc.data.data());
146 RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
147 absl::MakeSpan(ptr, elements_count));
148 } else {
149 half4* ptr = reinterpret_cast<half4*>(desc.data.data());
150 RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
151 absl::MakeSpan(ptr, elements_count));
152 }
153
154 args_.AddObject("weights",
155 absl::make_unique<BufferDescriptor>(std::move(desc)));
156 }
157
158 template <DataType T>
UploadBiases(const tflite::gpu::Tensor<Linear,T> & biases)159 void ConvBuffer1x1::UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases) {
160 TensorLinearDescriptor desc;
161 desc.storage_type = LinearStorageType::BUFFER;
162 desc.element_type = definition_.GetDataType();
163 int depth = AlignByN(biases.shape.v, 4 * conv_params_.block_size.z) / 4;
164 desc.UploadLinearData(biases, depth);
165 args_.AddObject("biases",
166 absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
167 }
168
169 bool IsConvBuffer1x1Supported(const OperationDef& definition,
170 const Convolution2DAttributes& attr);
171
172 bool IsConvBuffer1x1Supported(const OperationDef& definition,
173 const BHWC& weights_shape,
174 const Convolution2DAttributes& attr);
175
176 ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
177 const OperationDef& definition,
178 const Convolution2DAttributes& attr,
179 const BHWC* shape = nullptr);
180
181 ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
182 const OperationDef& definition,
183 const FullyConnectedAttributes& attr,
184 const BHWC* shape = nullptr);
185
186 ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
187 const GpuInfo& gpu_info, const OperationDef& definition,
188 const Convolution2DAttributes& attr, const BHWC& weights_shape,
189 const BHWC* dst_shape = nullptr);
190
191 ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
192 const GpuInfo& gpu_info, const OperationDef& definition,
193 const Convolution2DAttributes& attr, const BHWC* shape = nullptr);
194
195 } // namespace gpu
196 } // namespace tflite
197
198 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_BUFFER_1X1_H_
199