/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/fc_fc_add.h"

#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace {
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}

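// Repacks int8 fully connected weights from OHWI into the OIO4I4 layout
// consumed by the kernel: one 4 (inputs) x 4 (outputs) block per
// (dst_depth, src_depth) pair. Values are stored biased by +127 (clamped at
// 0) so they fit an unsigned normalized texture; out-of-range channels are
// padded with 127, i.e. the quantized value 0.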
void RearrangeFCWeightsToOIO4I4(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, uint8_t* dst) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < weights.shape.i && dst_ch < weights.shape.o) {
            int t =
                127 +
                weights.data[weights.shape.LinearIndex({dst_ch, 0, 0, src_ch})];
            if (t < 0) {
              t = 0;
            }
            dst[counter++] = t;
          } else {
            dst[counter++] = 127;
          }
        }
      }
    }
  }
}
}  // namespace

FCFCAdd::FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsNvidia()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsPowerVR()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
}

FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {}

FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// We split the vec-vec dot product (each thread computes one vec-vec dot
// product of the basic vec-mat multiplication) into 4 parts to create more
// threads: the thread with local index tid.y processes every 4-th element of
// the dot product.
// This gives good results for sizes around 1024 x 1024; for other sizes more
// specialized shaders could be written.
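// Per-thread partial sums are staged in local memory and reduced over tid.y
// by the tid.y == 0 threads before the biases are added and the result is
// written.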

std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info,
                                          bool weights_are_buffer,
                                          bool quantized_0, bool quantized_1) {
  AddSrcTensor("src_tensor_0", op_def.src_tensors[0]);
  AddSrcTensor("src_tensor_1", op_def.src_tensors[1]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = get_global_id(0);
  int2 tid;
  tid.x = LOCAL_ID_0;
  tid.y = LOCAL_ID_1;
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor_0.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_0.Read(0, 0, c);
)";
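  // Buffer-backed weights pack a full 4x4 block into one FLT16 element; the
  // texture path reads the same block as four FLT4 texels and, when the
  // weights are quantized, dequantizes them with the q0_m / q0_a arguments.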
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights0.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    c += R"(FLT4 w0 = args.weights0.Read(c * 4 + 0, gid);
      FLT4 w1 = args.weights0.Read(c * 4 + 1, gid);
      FLT4 w2 = args.weights0.Read(c * 4 + 2, gid);
      FLT4 w3 = args.weights0.Read(c * 4 + 3, gid);
)";
    if (quantized_0) {
      c += R"(w0 = w0 * args.q0_m + args.q0_a;
      w1 = w1 * args.q0_m + args.q0_a;
      w2 = w2 * args.q0_m + args.q0_a;
      w3 = w3 * args.q0_m + args.q0_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
  for (int c = tid.y; c < args.src_tensor_1.Slices(); c += WG_Y) {
    FLT4 v = args.src_tensor_1.Read(0, 0, c);
)";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights1.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    c += R"(FLT4 w0 = args.weights1.Read(c * 4 + 0, gid);
      FLT4 w1 = args.weights1.Read(c * 4 + 1, gid);
      FLT4 w2 = args.weights1.Read(c * 4 + 2, gid);
      FLT4 w3 = args.weights1.Read(c * 4 + 3, gid);
)";
    if (quantized_1) {
      c += R"(w0 = w0 * args.q1_m + args.q1_a;
      w1 = w1 * args.q1_m + args.q1_a;
      w2 = w2 * args.q1_m + args.q1_a;
      w3 = w3 * args.q1_m + args.q1_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
  LOCAL_MEM_BARRIER;
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
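  // Reduce the partial sums accumulated by threads that share tid.x but have
  // different tid.y; only the tid.y == 0 threads perform the final write.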
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c +=
      R"(    FLT4 r0 = TO_FLT4(s) + args.biases0.Read(gid) + args.biases1.Read(gid);
    args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); }

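// Uploads int8 weights as a normalized UINT8 texture together with two scalar
// arguments that let the kernel dequantize in place: a texel read returns
// u = (q + 127) / 255, so u * q_m + q_a with q_m = scale * 255 and
// q_a = -scale * (127 + zero_point) recovers scale * (q - zero_point).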
void FCFCAdd::UploadQuantizedWeights(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
    float zero_point, int index) {
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
  Texture2DDescriptor desc;
  desc.element_type = DataType::UINT8;
  desc.normalized = true;
  desc.normalized_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.size = int2(src_depth * 4, dst_depth);
  desc.data.resize(src_depth * 4 * dst_depth * 4);
  RearrangeFCWeightsToOIO4I4(weights, desc.data.data());

  std::string q_name = "q" + std::to_string(index) + "_";
  if (definition_.precision == CalculationsPrecision::F32) {
    args_.AddFloat(q_name + "m", scale * 255.0f);
    args_.AddFloat(q_name + "a", -scale * (127.0 + zero_point));
  } else {
    args_.AddHalf(q_name + "m", half(scale * 255.0f));
    args_.AddHalf(q_name + "a", half(-scale * (127.0 + zero_point)));
  }
  args_.AddObject("weights" + std::to_string(index),
                  absl::make_unique<Texture2DDescriptor>(std::move(desc)));
}

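// Builds the fused operation for two float fully connected layers whose
// outputs feed a single Add. Weights go to a buffer or a 2D texture depending
// on the vendor (see UseBufferForWeights); both bias vectors are bound as
// linear textures. Typically this is instantiated by the delegate's
// special-case operation selection rather than used directly.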
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  bool weights_are_buffer = UseBufferForWeights(gpu_info);
  result.UploadWeights(attr0.weights, "weights0", weights_are_buffer);
  result.UploadWeights(attr1.weights, "weights1", weights_are_buffer);
  result.code_ = result.GetFCFCAddKernelCode(definition, gpu_info,
                                             weights_are_buffer, false, false);

  TensorLinearDescriptor desc0;
  desc0.storage_type = LinearStorageType::TEXTURE_2D;
  desc0.element_type = definition.GetDataType();
  desc0.UploadLinearData(attr0.bias);
  result.args_.AddObject(
      "biases0", absl::make_unique<TensorLinearDescriptor>(std::move(desc0)));

  TensorLinearDescriptor desc1;
  desc1.storage_type = LinearStorageType::TEXTURE_2D;
  desc1.element_type = definition.GetDataType();
  desc1.UploadLinearData(attr1.bias);
  result.args_.AddObject(
      "biases1", absl::make_unique<TensorLinearDescriptor>(std::move(desc1)));

  return result;
}

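// Int8 variant: weights stay quantized on the GPU (UploadQuantizedWeights)
// and the generated kernel dequantizes them inline, so the buffer weight path
// is not used here (weights_are_buffer is passed as false).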
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedInt8Attributes& attr0,
                      const FullyConnectedInt8Attributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  result.UploadQuantizedWeights(attr0.weights, attr0.scale, attr0.zero_point,
                                0);
  result.UploadQuantizedWeights(attr1.weights, attr1.scale, attr1.zero_point,
                                1);
  result.code_ =
      result.GetFCFCAddKernelCode(definition, gpu_info, false, true, true);

  TensorLinearDescriptor desc0;
  desc0.storage_type = LinearStorageType::TEXTURE_2D;
  desc0.element_type = definition.GetDataType();
  desc0.UploadLinearData(attr0.bias);
  result.args_.AddObject(
      "biases0", absl::make_unique<TensorLinearDescriptor>(std::move(desc0)));

  TensorLinearDescriptor desc1;
  desc1.storage_type = LinearStorageType::TEXTURE_2D;
  desc1.element_type = definition.GetDataType();
  desc1.UploadLinearData(attr1.bias);
  result.args_.AddObject(
      "biases1", absl::make_unique<TensorLinearDescriptor>(std::move(desc1)));

  return result;
}

}  // namespace gpu
}  // namespace tflite