/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/fully_connected.h"

#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/storage_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {

namespace {
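// On these vendors the weights are uploaded into a buffer object; on all
// other GPUs they are read through a 2D texture (see the two addressing
// branches in GetFullyConnectedKernelCode below).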
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali() ||
         gpu_info.IsApple();
}
}  // namespace

FullyConnected::FullyConnected(const OperationDef& definition,
                               const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel() || gpu_info.IsNvidia() ||
             gpu_info.IsPowerVR() || gpu_info.IsApple()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
  code_ = GetFullyConnectedKernelCode(definition_, gpu_info);
}

FullyConnected::FullyConnected(FullyConnected&& kernel)
    : GPUOperation(std::move(kernel)) {}

FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// The vector-vector dot product that each thread would compute in a basic
// vector-matrix multiply is split into WG_Y (currently 4) parts to create
// more threads: the thread with local id tid.y processes every WG_Y-th
// element of the dot product, and the partial sums are reduced through local
// memory. This works well for sizes around 1024 x 1024; other shapes could
// use more specialized kernels.
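//
// Illustration with WG_Y = 4 and src_tensor.Slices() = 8:
//   tid.y == 0 accumulates slices 0 and 4,
//   tid.y == 1 accumulates slices 1 and 5,
//   tid.y == 2 accumulates slices 2 and 6,
//   tid.y == 3 accumulates slices 3 and 7.
// The thread with tid.y == 0 then sums the partial results staged in the
// __local array, adds the bias, and writes the output slice.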

std::string FullyConnected::GetFullyConnectedKernelCode(
    const OperationDef& op_def, const GpuInfo& gpu_info) {
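  // When the whole work group (WG_X * WG_Y == 32 threads) fits into a single
  // 32-wide wave, a SIMD-scope barrier is enough for the local-memory
  // reduction; otherwise a full local memory barrier is emitted.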
  const int wg_total_size = work_group_size_.x * work_group_size_.y;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  const bool weights_are_buffer = UseBufferForWeights(gpu_info);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = GLOBAL_ID_0;
  int2 tid = INIT_INT2v2(LOCAL_ID_0, LOCAL_ID_1);
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor.Read(0, 0, c);
)";
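  // The two branches below differ only in how the weight matrix is addressed.
  // As implied by the reads generated here: with buffer weights, one FLT16
  // (a 4x4 block of the weight matrix) is stored per (src slice, dst slice)
  // pair at linear index c * dst_slices + gid; with texture weights, the same
  // 4x4 block is stored as four FLT4 texels at (c * 4 + 0 .. 3, gid).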
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    c += R"(FLT4 w0 = args.weights.Read(c * 4 + 0, gid);
      FLT4 w1 = args.weights.Read(c * 4 + 1, gid);
      FLT4 w2 = args.weights.Read(c * 4 + 2, gid);
      FLT4 w3 = args.weights.Read(c * 4 + 3, gid);
      FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
)";
  c += "  " + barrier + ";\n";
  c += R"(
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c += R"(    FLT4 r0 = TO_FLT4(s) + args.biases.Read(gid);
    args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

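// One work item is launched per output slice along x; the y dimension of the
// work group exists only to split the per-slice dot product, so the grid
// itself stays one-dimensional.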
int3 FullyConnected::GetGridSize() const {
  return int3(dst_[0]->Slices(), 1, 1);
}

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedAttributes& attr) {
  FullyConnected result(definition, gpu_info);
  result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));

  TensorLinearDescriptor desc;
  desc.storage_type = gpu_info.SupportsImages() ? LinearStorageType::TEXTURE_2D
                                                : LinearStorageType::BUFFER;
  if (gpu_info.IsApple()) {
    desc.storage_type =
        DeduceLinearStorageType(definition.GetPrimaryStorageType());
  }
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  result.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));

  return result;
}
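//
// Sketch of how this factory is typically driven (the surrounding objects are
// illustrative, not defined in this file):
//
//   FullyConnectedAttributes attr = ...;  // weights + bias from the model
//   OperationDef op_def = ...;            // precision and tensor storage types
//   FullyConnected op = CreateFullyConnected(gpu_info, op_def, attr);
//   // `op` is then usually wrapped in a std::unique_ptr<GPUOperation> by the
//   // delegate's operation selector and compiled/dispatched from there.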

}  // namespace gpu
}  // namespace tflite