/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_

#include <stdint.h>

#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {

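// Repacks fully connected weights from the OHWI tensor layout into the
// blocked "IOO4I4" layout used by the buffer path of FCFCAdd::UploadWeights
// below. A sketch of the mapping implemented by the loops (O = weights.shape.o,
// I = weights.shape.i, AlignByN pads to a multiple of 4):
//   dst[(i / 4) * AlignByN(O, 4) * 4 + (o / 4) * 16 + (i % 4) * 4 + (o % 4)]
//       = weights.data[o * I + i]
// Positions falling into the zero padding of either dimension are written as 0.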
template <DataType T, typename S>
void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int padded_src_channels = AlignByN(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int padded_dst_channels = AlignByN(dst_channels, 4);

  for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
    for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
        for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
          int y = 4 * block_y + y_in_block;
          int x = 4 * block_x + x_in_block;
          int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
                          x_in_block * 4 + y_in_block;
          if (x < src_channels && y < dst_channels) {
            dst[dst_index] = weights.data[src_channels * y + x];
          } else {
            dst[dst_index] = 0.0f;
          }
        }
      }
    }
  }
}

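// Repacks fully connected weights into the blocked "OIO4I4" layout used by the
// texture path of FCFCAdd::UploadWeights below. The loops effectively compute
// (with s/d indexing 4-channel blocks of the input/output, I = weights.shape.i):
//   dst[((d * src_depth + s) * 4 + i) * 4 + j]
//       = weights.data[(d * 4 + j) * I + (s * 4 + i)]
// Out-of-range channels in the padded blocks are zero-filled.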
template <DataType T, typename S>
void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int dst_depth = DivideRoundUp(dst_channels, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < src_channels && dst_ch < dst_channels) {
            dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
          } else {
            dst[counter++] = 0.0f;
          }
        }
      }
    }
  }
}

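// Fused GPU operation that evaluates two fully connected layers and adds their
// outputs in a single kernel (hence "FCFCAdd"). Instances are built through the
// CreateFCFCAdd factory functions declared at the end of this header, either
// from float or from int8 (quantized) fully connected attributes.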
class FCFCAdd : public GPUOperation {
 public:
  FCFCAdd() = default;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
  }
  int3 GetGridSize() const override;

  // Move only
  FCFCAdd(FCFCAdd&& kernel);
  FCFCAdd& operator=(FCFCAdd&& kernel);
  FCFCAdd(const FCFCAdd&) = delete;
  FCFCAdd& operator=(const FCFCAdd&) = delete;

 private:
  FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info);
  friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
                               const OperationDef& definition,
                               const FullyConnectedAttributes& attr0,
                               const FullyConnectedAttributes& attr1);
  friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
                               const OperationDef& definition,
                               const FullyConnectedInt8Attributes& attr0,
                               const FullyConnectedInt8Attributes& attr1);

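  // Uploads int8 weights together with their quantization parameters
  // (scale/zero_point); `index` appears to select which of the two fully
  // connected weight sets is being uploaded.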
  void UploadQuantizedWeights(
      const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
      float zero_point, int index);

  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                     const std::string& name, bool weights_are_buffer);

  std::string GetFCFCAddKernelCode(const OperationDef& op_def,
                                   const GpuInfo& gpu_info,
                                   bool weights_are_buffer, bool quantized_0,
                                   bool quantized_1);
};

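// Uploads float weights either as a buffer (IOO4I4 layout) or as a 2D texture
// (OIO4I4 layout), stored as FP32 or FP16 depending on the operation's
// calculation precision.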
template <DataType T>
void FCFCAdd::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                            const std::string& name, bool weights_are_buffer) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  const int elements_count = src_depth * dst_depth * 4;
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;

  const int float4_size = f32_weights ? 16 : 8;

  if (weights_are_buffer) {
    BufferDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    desc.element_size = 16;
    desc.size = float4_size * elements_count;
    desc.data.resize(desc.size);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    }

    args_.AddObject(name, absl::make_unique<BufferDescriptor>(std::move(desc)));
  } else {
    Texture2DDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    // desc.element_type = DataType::UINT8;
    // desc.normalized = true;
    // desc.normalized_type = f32_weights ? DataType::FLOAT32 :
    // DataType::FLOAT16;
    desc.size = int2(src_depth * 4, dst_depth);
    desc.data.resize(float4_size * elements_count);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    }

    args_.AddObject(name,
                    absl::make_unique<Texture2DDescriptor>(std::move(desc)));
  }
}

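// Factory functions for the fused FC + FC + Add operation: the first variant
// takes float FullyConnectedAttributes, the second one int8-quantized
// attributes.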
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1);

FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedInt8Attributes& attr0,
                      const FullyConnectedInt8Attributes& attr1);
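
// Usage sketch (hypothetical call site; in practice GpuInfo, OperationDef and
// the attributes come from the delegate's graph transformations):
//   FCFCAdd op = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);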

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_