/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/fully_connected.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {

namespace {
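// Returns true when fully connected weights should be uploaded as a buffer
// rather than a 2D texture on this GPU.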
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali() ||
         gpu_info.IsApple();
}

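// Repacks int8 OHWI fully connected weights into 4x4 blocks of output/input
// channels, offsetting each value by 127 so it can be stored as uint8
// (int8 -128 clamps to 0). Padded positions are filled with 127, the encoding
// of int8 zero.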
void RearrangeFCWeightsToOIO4I4(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, uint8_t* dst) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < weights.shape.i && dst_ch < weights.shape.o) {
            int t =
                127 +
                weights.data[weights.shape.LinearIndex({dst_ch, 0, 0, src_ch})];
            if (t < 0) {
              t = 0;
            }
            dst[counter++] = t;
          } else {
            dst[counter++] = 127;
          }
        }
      }
    }
  }
}

}  // namespace

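// Work group sizes are tuned per vendor; work_group_size_.x threads each
// produce one output slice and work_group_size_.y threads cooperate on the
// dot product for that slice (see the comment above
// GetFullyConnectedKernelCode).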
FullyConnected::FullyConnected(const OperationDef& definition,
                               const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel() || gpu_info.IsNvidia() ||
             gpu_info.IsPowerVR() || gpu_info.IsApple()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
}

FullyConnected::FullyConnected(FullyConnected&& kernel)
    : GPUOperation(std::move(kernel)) {}

FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// We split the vector-vector dot product (each thread computes one such dot
// product in the basic vector-matrix multiplication) into 4 parts to create
// more threads: the thread with local index tid.y processes every 4-th
// element of the dot product. This gives good results for sizes around
// 1024 x 1024; more specialized shaders could be written for other sizes.

std::string FullyConnected::GetFullyConnectedKernelCode(
    const OperationDef& op_def, const GpuInfo& gpu_info,
    bool weights_are_buffer, bool quantized) {
  const int wg_total_size = work_group_size_.x * work_group_size_.y;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = GLOBAL_ID_0;
  int2 tid = INIT_INT2v2(LOCAL_ID_0, LOCAL_ID_1);
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor.Read(0, 0, c);
)";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
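    // Texture path: four FLT4 reads per 4x4 weight block; the reads are
    // dequantized below in the quantized case.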
    const std::string read_as_type =
        op_def.precision == CalculationsPrecision::F32 ? "float" : "half";
    c += "      FLT4 w0 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 0, gid);\n";
    c += "      FLT4 w1 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 1, gid);\n";
    c += "      FLT4 w2 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 2, gid);\n";
    c += "      FLT4 w3 = args.weights.Read<" + read_as_type +
         ">(c * 4 + 3, gid);\n";
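(c * 4">
    // Dequantize uint8 weights on the fly: q0 = scale and
    // q1 = -scale * (127 + zero_point), set in UploadQuantizedWeights().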
    if (quantized) {
      c += R"(w0 = w0 * args.q0 + args.q1;
      w1 = w1 * args.q0 + args.q1;
      w2 = w2 * args.q0 + args.q1;
      w3 = w3 * args.q0 + args.q1;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
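  // Store each thread's partial sum in local memory and let the tid.y == 0
  // threads reduce across the WG_Y dimension.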
  c += R"(    }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
)";
  c += "  " + barrier + ";\n";
  c += R"(
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c += R"(    FLT4 r0 = TO_FLT4(s) + args.biases.Read(gid);
    args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

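// The grid has one invocation per output slice; the reduction over input
// slices happens within the work group.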
int3 FullyConnected::GetGridSize() const {
  return int3(dst_[0]->Slices(), 1, 1);
}

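// Packs the int8 weights into a uint8 2D texture and registers the
// dequantization constants q0 and q1 used by the generated kernel.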
void FullyConnected::UploadQuantizedWeights(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
    float zero_point) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  std::vector<uint8_t> data(src_depth * 4 * dst_depth * 4);
  RearrangeFCWeightsToOIO4I4(weights, data.data());
  TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
      DataType::UINT8, TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth,
      data.data());

  if (definition_.precision == CalculationsPrecision::F32) {
    args_.AddFloat("q0", scale);
    args_.AddFloat("q1", -scale * (127.0 + zero_point));
  } else {
    args_.AddHalf("q0", half(scale));
    args_.AddHalf("q1", half(-scale * (127.0 + zero_point)));
  }
  args_.AddObject("weights",
                  std::make_unique<TensorDescriptor>(std::move(desc)));
}

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedAttributes& attr) {
  FullyConnected result(definition, gpu_info);
  result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));
  result.code_ = result.GetFullyConnectedKernelCode(
      definition, gpu_info, UseBufferForWeights(gpu_info), false);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));

  return result;
}

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedInt8Attributes& attr) {
  FullyConnected result(definition, gpu_info);
  result.UploadQuantizedWeights(attr.weights, attr.scale, attr.zero_point);
  result.code_ =
      result.GetFullyConnectedKernelCode(definition, gpu_info, false, true);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));

  return result;
}

}  // namespace gpu
}  // namespace tflite