/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/fc_fc_add.h"

#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace {
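// Adreno, AMD and Mali GPUs read the fully connected weights from a buffer;
// other GPUs read them from a 2D texture.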
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}

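// Repacks int8 OHWI fully connected weights into groups of 4 input and 4
// output channels. Each value is offset by +127 (clamped at 0) so it can be
// stored in an unsigned normalized UINT8 texture; padding channels are filled
// with 127, i.e. zero once the offset is removed.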
void RearrangeFCWeightsToOIO4I4(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, uint8_t* dst) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < weights.shape.i && dst_ch < weights.shape.o) {
            int t =
                127 +
                weights.data[weights.shape.LinearIndex({dst_ch, 0, 0, src_ch})];
            if (t < 0) {
              t = 0;
            }
            dst[counter++] = t;
          } else {
            dst[counter++] = 127;
          }
        }
      }
    }
  }
}
}  // namespace

FCFCAdd::FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
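  // Per-vendor work group sizes. The Y dimension (4) sets how many threads
  // cooperate on one output slice (see WG_Y in the generated kernel).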
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsNvidia()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsPowerVR()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
}

FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {}

FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// We split the vec-vec dot product (in the basic vec-mat multiply every
// thread computes one vec-vec dot product) into 4 parts to create more
// threads: the tid.y-th thread processes every 4-th element of the dot
// product. This gives good results for ~1024 x 1024 sizes; for other sizes
// more optimized shaders can be written.

std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info,
                                          bool weights_are_buffer,
                                          bool quantized_0, bool quantized_1) {
  AddSrcTensor("src_tensor_0", op_def.src_tensors[0]);
  AddSrcTensor("src_tensor_1", op_def.src_tensors[1]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = get_global_id(0);
  int2 tid;
  tid.x = LOCAL_ID_0;
  tid.y = LOCAL_ID_1;
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor_0.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_0.Read(0, 0, c);
)";
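  // Buffer path: one FLT16 holds a 4x4 block of weights (4 input x 4 output
  // channels). Texture path: the same block is read as four FLT4 values,
  // optionally dequantized with the q*_m / q*_a constants.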
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights0.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    c += R"(FLT4 w0 = args.weights0.Read(c * 4 + 0, gid);
      FLT4 w1 = args.weights0.Read(c * 4 + 1, gid);
      FLT4 w2 = args.weights0.Read(c * 4 + 2, gid);
      FLT4 w3 = args.weights0.Read(c * 4 + 3, gid);
      )";
    if (quantized_0) {
      c += R"(w0 = w0 * args.q0_m + args.q0_a;
      w1 = w1 * args.q0_m + args.q0_a;
      w2 = w2 * args.q0_m + args.q0_a;
      w3 = w3 * args.q0_m + args.q0_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
    for (int c = tid.y; c < args.src_tensor_1.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_1.Read(0, 0, c);
      )";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights1.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    c += R"(FLT4 w0 = args.weights1.Read(c * 4 + 0, gid);
      FLT4 w1 = args.weights1.Read(c * 4 + 1, gid);
      FLT4 w2 = args.weights1.Read(c * 4 + 2, gid);
      FLT4 w3 = args.weights1.Read(c * 4 + 3, gid);
      )";
    if (quantized_1) {
      c += R"(w0 = w0 * args.q1_m + args.q1_a;
      w1 = w1 * args.q1_m + args.q1_a;
      w2 = w2 * args.q1_m + args.q1_a;
      w3 = w3 * args.q1_m + args.q1_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
  LOCAL_MEM_BARRIER;
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
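  // The tid.y == 0 thread adds up the partial sums produced by the other
  // WG_Y - 1 threads of its column before writing the result.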
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c +=
      R"(    FLT4 r0 = TO_FLT4(s) + args.biases0.Read(gid) + args.biases1.Read(gid);
    args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

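// One work item along X per output slice; gid indexes the output slice in the
// generated kernel.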
int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); }

void FCFCAdd::UploadQuantizedWeights(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
    float zero_point, int index) {
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
  Texture2DDescriptor desc;
  desc.element_type = DataType::UINT8;
  desc.normalized = true;
  desc.normalized_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.size = int2(src_depth * 4, dst_depth);
  desc.data.resize(src_depth * 4 * dst_depth * 4);
  RearrangeFCWeightsToOIO4I4(weights, desc.data.data());

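  // The texture stores (q + 127) / 255, so reading w and computing
  // w * q_m + q_a with q_m = scale * 255 and q_a = -scale * (127 + zero_point)
  // recovers the dequantized value scale * (q - zero_point).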
  std::string q_name = "q" + std::to_string(index) + "_";
  if (definition_.precision == CalculationsPrecision::F32) {
    args_.AddFloat(q_name + "m", scale * 255.0f);
    args_.AddFloat(q_name + "a", -scale * (127.0 + zero_point));
  } else {
    args_.AddHalf(q_name + "m", half(scale * 255.0f));
    args_.AddHalf(q_name + "a", half(-scale * (127.0 + zero_point)));
  }
  args_.AddObject("weights" + std::to_string(index),
                  absl::make_unique<Texture2DDescriptor>(std::move(desc)));
}

FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  bool weights_are_buffer = UseBufferForWeights(gpu_info);
  result.UploadWeights(attr0.weights, "weights0", weights_are_buffer);
  result.UploadWeights(attr1.weights, "weights1", weights_are_buffer);
  result.code_ = result.GetFCFCAddKernelCode(definition, gpu_info,
                                             weights_are_buffer, false, false);

  TensorLinearDescriptor desc0;
  desc0.storage_type = LinearStorageType::TEXTURE_2D;
  desc0.element_type = definition.GetDataType();
  desc0.UploadLinearData(attr0.bias);
  result.args_.AddObject(
      "biases0", absl::make_unique<TensorLinearDescriptor>(std::move(desc0)));

  TensorLinearDescriptor desc1;
  desc1.storage_type = LinearStorageType::TEXTURE_2D;
  desc1.element_type = definition.GetDataType();
  desc1.UploadLinearData(attr1.bias);
  result.args_.AddObject(
      "biases1", absl::make_unique<TensorLinearDescriptor>(std::move(desc1)));

  return result;
}

FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedInt8Attributes& attr0,
                      const FullyConnectedInt8Attributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  result.UploadQuantizedWeights(attr0.weights, attr0.scale, attr0.zero_point,
                                0);
  result.UploadQuantizedWeights(attr1.weights, attr1.scale, attr1.zero_point,
                                1);
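  // Quantized weights are always read from a texture (weights_are_buffer ==
  // false), with dequantization enabled for both fully connected inputs.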
  result.code_ =
      result.GetFCFCAddKernelCode(definition, gpu_info, false, true, true);

  TensorLinearDescriptor desc0;
  desc0.storage_type = LinearStorageType::TEXTURE_2D;
  desc0.element_type = definition.GetDataType();
  desc0.UploadLinearData(attr0.bias);
  result.args_.AddObject(
      "biases0", absl::make_unique<TensorLinearDescriptor>(std::move(desc0)));

  TensorLinearDescriptor desc1;
  desc1.storage_type = LinearStorageType::TEXTURE_2D;
  desc1.element_type = definition.GetDataType();
  desc1.UploadLinearData(attr1.bias);
  result.args_.AddObject(
      "biases1", absl::make_unique<TensorLinearDescriptor>(std::move(desc1)));

  return result;
}

}  // namespace gpu
}  // namespace tflite