/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/kernels/conv.h"

#include <memory>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/common/convert.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
#include "tensorflow/lite/delegates/gpu/gl/variable.h"
#include "tensorflow/lite/delegates/gpu/gl/workgroups/ideal_workgroup_picker.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

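// Generates a GLSL compute shader for a general 2D convolution. Each shader
// invocation computes one output texel (a vec4 of 4 output channels) in the
// framework-provided accumulator value_0 by iterating over all kernel taps
// and input channel slices. $name$ tokens in the shader source are
// placeholders that the GL shader compiler substitutes with the parameters
// and objects declared below.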
class Convolution : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
    if (ctx.input_shapes.size() != 1) {
      return absl::UnimplementedError(
          "Convolution does not support more than 1 runtime tensor");
    }
    const auto& attr =
        absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
    auto weights = attr.weights.shape;
    // One input offset per kernel tap (h, w position).
    const int offsets_count = weights.h * weights.w;
    const bool offsets_count_too_large = offsets_count > kMaxConstArraySize;
    std::vector<Variable> parameters;
    if (offsets_count_too_large) {
      // The kernel is too large to bake the offsets into a constant array;
      // pass the raw convolution parameters and compute offsets in the shader.
      parameters = {
          {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
          {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
          {"padding_w", attr.padding.prepended.w},
          {"padding_h", attr.padding.prepended.h},
          {"dilation_w", attr.dilations.w},
          {"dilation_h", attr.dilations.h},
          {"kernel_w", weights.w},
          {"kernel_h", weights.h},
          {"src_depth", DivideRoundUp(weights.i, 4)},
          {"stride", int2(attr.strides.w, attr.strides.h)},
      };
    } else {
      // Precompute the input offset of every kernel tap, folding in dilation
      // and prepended padding.
      std::vector<int2> offsets;
      for (int h = 0; h < weights.h; ++h) {
        for (int w = 0; w < weights.w; ++w) {
          offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w,
                               h * attr.dilations.h - attr.padding.prepended.h);
        }
      }
      parameters = {
          {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
          {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
          {"offsets_count", offsets_count},
          {"offsets", offsets},
          {"src_depth", DivideRoundUp(weights.i, 4)},
          {"stride", int2(attr.strides.w, attr.strides.h)},
      };
    }

    // A bounds check on the input coordinate is needed only when at least one
    // padding dimension is non-zero.
    bool non_empty_padding =
        attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
        attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0;

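    // Weights are uploaded as a read-only 3D object in PHWO4I4 layout: each
    // texel holds the weights of 4 input channels for one output channel, so
    // the inner loop below can consume them with vec4 dot products.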
    std::vector<std::pair<std::string, Object>> objects = {
        {"weights", MakeReadonlyObject(Get3DSizeForPHWO4I4(attr.weights.shape),
                                       ConvertToPHWO4I4(attr.weights))}};

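    // The shader body is assembled from fragments: a loop over kernel taps
    // (either a constant offsets array or an explicit ky/kx loop), an optional
    // boundary check, and an inner loop that accumulates vec4 dot products
    // over the input channel slices into value_0.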
    std::string source;
    if (offsets_count_too_large) {
      source = R"(
      int i = 0;
      for (int ky = 0; ky < $kernel_h$; ky++) {
        for (int kx = 0; kx < $kernel_w$; kx++, i++) {
          ivec2 coord = gid.xy * $stride$ + ivec2(kx * $dilation_w$ - $padding_w$, ky * $dilation_h$ - $padding_h$);)";
    } else {
      source = R"(
      for (int i = 0; i < $offsets_count$; ++i) {
        ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)";
    }
    if (non_empty_padding) {
      source += R"(
          if (coord.x < 0 || coord.y < 0 || coord.x >= $input_data_0_w$ || coord.y >= $input_data_0_h$) {
            continue;
          })";
    }
    source += R"(
          for (int l = 0; l < $src_depth$; ++l) {
            vec4 input_ = $input_data_0[coord.x, coord.y, l]$;
            value_0.x += dot(input_, $weights[l * 4 + 0, i, gid.z]$);
            value_0.y += dot(input_, $weights[l * 4 + 1, i, gid.z]$);
            value_0.z += dot(input_, $weights[l * 4 + 2, i, gid.z]$);
            value_0.w += dot(input_, $weights[l * 4 + 3, i, gid.z]$);
          }
        }
)";
    if (offsets_count_too_large) {
      source += R"(
      }
)";
    }
    if (!attr.bias.data.empty()) {
      source += "value_0 += $bias[gid.z]$;\n";
      objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
    }

    *generated_code = {
        /*parameters=*/std::move(parameters),
        /*objects=*/std::move(objects),
        /*shared_variables=*/{},
        // An empty workload lets the compiler derive it from the output shape.
        /*workload=*/uint3(),
        /*workgroup=*/
        GetIdealWorkgroupIfPossible(
            *ctx.gpu_info, OperationType::CONVOLUTION_2D,
            HW(weights.h, weights.w), attr.strides, uint3(0, 0, 0),
            OHWI(weights.o, ctx.input_shapes[0][1], ctx.input_shapes[0][2],
                 ctx.input_shapes[0][3])),
        /*source_code=*/std::move(source),
        /*input=*/IOStructure::ONLY_DEFINITIONS,
        /*output=*/IOStructure::AUTO,
    };
    return absl::OkStatus();
  }
};

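// Picks how many output pixels along the x axis each Convolution1x1
// invocation computes. Wider multipliers amortize weight fetches across
// several pixels but only apply when the width divides evenly; AMD GPUs are
// kept at 1, and Mali is capped at 2 unless precision loss is allowed.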
int SelectMultiplier(int32_t input_width,
                     const NodeShader::GenerationContext& ctx) {
  std::vector<int> multipliers = {4, 2};
  if (ctx.gpu_info->IsAMD()) {
    return 1;
  }
  if (!ctx.compiler_options.allow_precision_loss && ctx.gpu_info->IsMali()) {
    multipliers = {2};
  }
  for (int i : multipliers) {
    if (input_width % i == 0) {
      return i;
    }
  }
  return 1;
}

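// Specialized shader for 1x1 convolution with unit stride, unit dilation, and
// no padding: each output texel is a matrix-vector product over the input
// channels. A multiplier > 1 makes each invocation compute several
// neighboring x pixels so that every fetched weight vector is reused
// multiplier times.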
class Convolution1x1 : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
    if (ctx.input_shapes.size() != 1) {
      return absl::UnimplementedError(
          "Convolution does not support more than 1 runtime tensor");
    }
    const auto& attr =
        absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
    if (attr.weights.shape.h != 1 || attr.weights.shape.w != 1) {
      return absl::UnimplementedError("Height and width should be 1.");
    }
    if (attr.dilations.h != 1 || attr.dilations.w != 1) {
      return absl::UnimplementedError("Dilations are not supported.");
    }
    if (attr.strides.h != 1 || attr.strides.w != 1) {
      return absl::UnimplementedError("Strides are not supported.");
    }
    if (attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
        attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0) {
      return absl::UnimplementedError("Padding is not supported.");
    }

    int multiplier = SelectMultiplier(ctx.input_shapes[0][2], ctx);

    std::vector<Variable> parameters = {
        {"src_depth",
         DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
    };

    std::vector<std::pair<std::string, Object>> objects = {
        {"weights",
         MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
                                  DivideRoundUp(attr.weights.shape.o, 4)),
                            ConvertToPHWO4I4(attr.weights))}};
    std::string source;
    for (int i = 0; i < multiplier; i++) {
      absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
    }
    absl::StrAppend(&source, "vec4 f;\n");
    absl::StrAppend(&source, "for (int l = 0; l < $src_depth$; ++l) {\n");
    for (int i = 0; i < multiplier; i++) {
      absl::StrAppend(&source, "  vec4 input", i, " = $input_data_0[gid.x * ",
                      multiplier, " + ", i, ",gid.y,l]$;\n");
    }
    for (int k = 0; k < 4; k++) {
      absl::StrAppend(&source, "  f = $weights[", k, ", l, gid.z]$;\n");
      for (int i = 0; i < multiplier; i++) {
        absl::StrAppend(&source, "  result", i, "[", k, "] += dot(input", i,
                        ", f);\n");
      }
    }
    absl::StrAppend(&source, "}\n");
    if (!attr.bias.data.empty()) {
      objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
      absl::StrAppend(&source, "vec4 b = $bias[gid.z]$;\n");
      for (int i = 0; i < multiplier; i++) {
        absl::StrAppend(&source, "result", i, " += b;\n");
      }
    }
    if (multiplier != 1) {
      for (int i = 0; i < multiplier; i++) {
        absl::StrAppend(&source, "$inplace_update:result", i, "$\n");
        absl::StrAppend(&source, "$output_data_0[gid.x * ", multiplier, " + ",
                        i, ",gid.y,gid.z] = result", i, "$;\n");
      }
    } else {
      absl::StrAppend(&source, "value_0 = result0;\n");
    }

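    // Workgroup sizes below are hand-tuned per output depth, with a separate
    // table for Adreno GPUs; GetIdealWorkgroupIfPossible may still replace
    // them with a known-good configuration for this GPU and convolution shape.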
    auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
    uint3 workgroup = uint3(16, 16, 1);
    if (ctx.gpu_info->IsAdreno()) {
      if (dst_depth >= 2) {
        workgroup = uint3(8, 8, 2);
      }
      if (dst_depth >= 4) {
        workgroup = uint3(4, 8, 4);
      }
      if (dst_depth >= 8) {
        workgroup = uint3(4, 4, 8);
      }
      if (dst_depth >= 32) {
        workgroup = uint3(4, 4, 16);
      }
      if (dst_depth >= 64) {
        workgroup = uint3(2, 8, 16);
      }
    } else {
      if (dst_depth >= 2) {
        workgroup = uint3(16, 8, 2);
      }
      if (dst_depth >= 4) {
        workgroup = uint3(16, 4, 4);
      }
      if (dst_depth >= 8) {
        workgroup = uint3(8, 4, 8);
      }
      if (dst_depth >= 32) {
        workgroup = uint3(8, 4, 8);
      }
      if (dst_depth >= 64) {
        workgroup = uint3(8, 4, 8);
      }
    }
    *generated_code = {
        /*parameters=*/std::move(parameters),
        /*objects=*/std::move(objects),
        /*shared_variables=*/{},
        /*workload=*/
        uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
              DivideRoundUp(ctx.output_shapes[0][3], 4)),
        /*workgroup=*/
        GetIdealWorkgroupIfPossible(
            *ctx.gpu_info, OperationType::CONVOLUTION_2D,
            HW(attr.weights.shape.h, attr.weights.shape.w), attr.strides,
            workgroup,
            OHWI(attr.weights.shape.o, ctx.input_shapes[0][1],
                 ctx.input_shapes[0][2], ctx.input_shapes[0][3])),
        /*source_code=*/std::move(source),
        /*input=*/IOStructure::ONLY_DEFINITIONS,
        // With multiplier > 1 the shader stores its results itself, so the
        // framework must not auto-generate the output write.
        /*output=*/multiplier == 1 ? IOStructure::AUTO
                                   : IOStructure::ONLY_DEFINITIONS,
    };
    return absl::OkStatus();
  }
};

}  // namespace

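// Factories for the two shader variants. Convolution1x1 rejects unsupported
// attributes with UnimplementedError, which lets callers fall back to the
// generic Convolution shader.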
std::unique_ptr<NodeShader> NewConvolutionNodeShader() {
  return absl::make_unique<Convolution>();
}

std::unique_ptr<NodeShader> NewConvolution1x1NodeShader() {
  return absl::make_unique<Convolution1x1>();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite