/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/fc_fc_add.h"

#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace {
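// On Adreno, AMD and Mali the fully connected weights are stored in a buffer;
// other GPUs take the texture path (see the two read branches in
// GetFCFCAddKernelCode and the UploadWeights calls in CreateFCFCAdd).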
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}
}  // namespace

FCFCAdd::FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsNvidia()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsPowerVR()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
  code_ = GetFCFCAddKernelCode(definition_, gpu_info);
}

FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {}

FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// The vector-vector dot product that each thread computes in a basic
// vector-matrix multiplication is split into WG_Y (here 4) parts to create
// more threads: the thread with local id tid.y processes every WG_Y-th
// element of the dot product. This gives good results for sizes around
// 1024 x 1024; other sizes may need more specialized shaders.
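//
// Roughly, the generated kernel computes (a sketch in pseudocode, not the
// literal emitted code):
//
//   s = 0;
//   for (c = tid.y; c < src_0.Slices(); c += WG_Y)
//     s += weights0[c][gid] * src_0[c];
//   for (c = tid.y; c < src_1.Slices(); c += WG_Y)
//     s += weights1[c][gid] * src_1[c];
//   temp[tid.x][tid.y] = s;   // shared across the work group
//   barrier();
//   if (tid.y == 0)
//     dst[gid] = s + temp[tid.x][1] + ... + temp[tid.x][WG_Y - 1]
//              + biases0[gid] + biases1[gid];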

std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info) {
  AddSrcTensor("src_tensor_0", op_def.src_tensors[0]);
  AddSrcTensor("src_tensor_1", op_def.src_tensors[1]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  const bool weights_are_buffer = UseBufferForWeights(gpu_info);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = get_global_id(0);
  int2 tid;
  tid.x = LOCAL_ID_0;
  tid.y = LOCAL_ID_1;
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor_0.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_0.Read(0, 0, c);
)";
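  // Weight layout: with buffer storage the 16 weights for input slice c and
  // output slice gid are read as a single FLT16; on the texture path the same
  // values come from four FLT4 texels at x = c * 4 + {0, 1, 2, 3}, y = gid.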
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights0.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    c += R"(FLT4 w0 = args.weights0.Read(c * 4 + 0, gid);
      FLT4 w1 = args.weights0.Read(c * 4 + 1, gid);
      FLT4 w2 = args.weights0.Read(c * 4 + 2, gid);
      FLT4 w3 = args.weights0.Read(c * 4 + 3, gid);
      FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"( }
    for (int c = tid.y; c < args.src_tensor_1.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_1.Read(0, 0, c);
)";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights1.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    c += R"(FLT4 w0 = args.weights1.Read(c * 4 + 0, gid);
      FLT4 w1 = args.weights1.Read(c * 4 + 1, gid);
      FLT4 w2 = args.weights1.Read(c * 4 + 2, gid);
      FLT4 w3 = args.weights1.Read(c * 4 + 3, gid);
      FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
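  // Reduce the WG_Y partial sums through local memory: every thread stores its
  // accumulator, and after the barrier the tid.y == 0 threads add up the other
  // rows, add both biases and write the result.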
  c += R"( }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
  LOCAL_MEM_BARRIER;
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
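  // The reduction over the WG_Y partial-sum rows is unrolled at
  // code-generation time.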
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "  s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c +=
      R"( FLT4 r0 = TO_FLT4(s) + args.biases0.Read(gid) + args.biases1.Read(gid);
  args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

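// One work-item per destination slice along X; the Y dimension of the work
// group (WG_Y) is used inside the kernel to split each dot product.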
int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); }

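// Builds the fused operation: both weight matrices are uploaded in the layout
// selected by UseBufferForWeights, and both bias vectors are attached as
// linear TEXTURE_2D objects that the generated kernel adds to the combined
// result.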
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  result.UploadWeights(attr0.weights, "weights0",
                       UseBufferForWeights(gpu_info));
  result.UploadWeights(attr1.weights, "weights1",
                       UseBufferForWeights(gpu_info));

  TensorLinearDescriptor desc0;
  desc0.storage_type = LinearStorageType::TEXTURE_2D;
  desc0.element_type = definition.GetDataType();
  desc0.UploadLinearData(attr0.bias);
  result.args_.AddObject(
      "biases0", absl::make_unique<TensorLinearDescriptor>(std::move(desc0)));

  TensorLinearDescriptor desc1;
  desc1.storage_type = LinearStorageType::TEXTURE_2D;
  desc1.element_type = definition.GetDataType();
  desc1.UploadLinearData(attr1.bias);
  result.args_.AddObject(
      "biases1", absl::make_unique<TensorLinearDescriptor>(std::move(desc1)));

  return result;
}

}  // namespace gpu
}  // namespace tflite