/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/fully_connected.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {

namespace {
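// Vendors for which the generated kernel reads float weights from a linear
// FLT16 buffer; on all other GPUs the texture read path is used (see the two
// branches in GetFullyConnectedKernelCode).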
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali() ||
         gpu_info.IsApple();
}

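// Repacks OHWI int8 weights for the quantized texture layout: for each output
// slice (o/4) and each input slice (i/4), a 4x4 block is written in which the
// four values for consecutive output channels of one input channel are
// contiguous. Each value is shifted by +127 so it fits in an unsigned byte
// (-128 saturates to 0); positions past the real shape are padded with 127,
// i.e. a source value of zero.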
void RearrangeFCWeightsToOIO4I4(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, uint8_t* dst) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < weights.shape.i && dst_ch < weights.shape.o) {
            int t =
                127 +
                weights.data[weights.shape.LinearIndex({dst_ch, 0, 0, src_ch})];
            if (t < 0) {
              t = 0;
            }
            dst[counter++] = t;
          } else {
            dst[counter++] = 127;
          }
        }
      }
    }
  }
}

}  // namespace

FullyConnected::FullyConnected(const OperationDef& definition,
                               const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel() || gpu_info.IsNvidia() ||
             gpu_info.IsPowerVR() || gpu_info.IsApple()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
}

FullyConnected::FullyConnected(FullyConnected&& kernel)
    : GPUOperation(std::move(kernel)) {}

FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// We split the vec-vec dot product (each thread computes one vec-vec dot
// product of the basic vec-mat multiply) into 4 parts to create more threads:
// the thread with local id tid.y starts at slice tid.y and processes every
// 4th slice of the dot product. This gives good results for ~1024 x 1024
// sizes; more optimized shaders could be written for other sizes.

std::string FullyConnected::GetFullyConnectedKernelCode(
    const OperationDef& op_def, const GpuInfo& gpu_info,
    bool weights_are_buffer, bool quantized) {
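  // A work group of exactly 32 threads that matches a 32-wide hardware wave
  // can use the cheaper SIMD-scoped barrier; otherwise a full local memory
  // barrier is emitted.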
  const int wg_total_size = work_group_size_.x * work_group_size_.y;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = GLOBAL_ID_0;
  int2 tid = INIT_INT2v2(LOCAL_ID_0, LOCAL_ID_1);
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor.Read(0, 0, c);
)";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    const std::string read_as_type =
        op_def.precision == CalculationsPrecision::F32 ? "float" : "half";
    c += "      FLT4 w0 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 0, gid);\n";
    c += "      FLT4 w1 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 1, gid);\n";
    c += "      FLT4 w2 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 2, gid);\n";
    c += "      FLT4 w3 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 3, gid);\n";
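    // Quantized weights are stored as uint8 values shifted by +127; q0 and q1
    // (set in UploadQuantizedWeights) dequantize them in-shader:
    // w * q0 + q1 == scale * (w - 127 - zero_point).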
    if (quantized) {
      c += R"(w0 = w0 * args.q0 + args.q1;
      w1 = w1 * args.q0 + args.q1;
      w2 = w2 * args.q0 + args.q1;
      w3 = w3 * args.q0 + args.q1;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
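  // Reduce across tid.y: every thread parks its partial sum in local memory,
  // and after the barrier the threads with tid.y == 0 accumulate the WG_Y
  // partial sums for their output slice, add the bias and write the result.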
  c += R"(    }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
)";
  c += "  " + barrier + ";\n";
  c += R"(
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c += R"(    FLT4 r0 = TO_FLT4(s) + args.biases.Read(gid);
    args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

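// One work item in x per output slice (group of 4 output channels).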
int3 FullyConnected::GetGridSize() const {
  return int3(dst_[0]->Slices(), 1, 1);
}

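// Uploads int8 weights as a uint8 TEXTURE_2D (layout produced by
// RearrangeFCWeightsToOIO4I4) and registers the dequantization constants
// q0 = scale and q1 = -scale * (127 + zero_point) used by the shader.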
void FullyConnected::UploadQuantizedWeights(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
    float zero_point) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  std::vector<uint8_t> data(src_depth * 4 * dst_depth * 4);
  RearrangeFCWeightsToOIO4I4(weights, data.data());
  TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
      DataType::UINT8, TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth,
      data.data());

  if (definition_.precision == CalculationsPrecision::F32) {
    args_.AddFloat("q0", scale);
    args_.AddFloat("q1", -scale * (127.0 + zero_point));
  } else {
    args_.AddHalf("q0", half(scale));
    args_.AddHalf("q1", half(-scale * (127.0 + zero_point)));
  }
  args_.AddObject("weights",
                  std::make_unique<TensorDescriptor>(std::move(desc)));
}

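// Float-weight variant: the weights_are_buffer flag from UseBufferForWeights
// selects which weight read path the generated kernel uses.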
FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedAttributes& attr) {
  FullyConnected result(definition, gpu_info);
  result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));
  result.code_ = result.GetFullyConnectedKernelCode(
      definition, gpu_info, UseBufferForWeights(gpu_info), false);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));

  return result;
}

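// Int8 variant: weights stay quantized in a texture and are dequantized in the
// shader, so the kernel is generated with weights_are_buffer = false and
// quantized = true.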
FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedInt8Attributes& attr) {
  FullyConnected result(definition, gpu_info);
  result.UploadQuantizedWeights(attr.weights, attr.scale, attr.zero_point);
  result.code_ =
      result.GetFullyConnectedKernelCode(definition, gpu_info, false, true);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));

  return result;
}

}  // namespace gpu
}  // namespace tflite