1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/lite/delegates/gpu/gl/kernels/conv.h"
17
18 #include <memory>
19 #include <vector>
20
21 #include "absl/memory/memory.h"
22 #include "absl/strings/str_cat.h"
23 #include "tensorflow/lite/delegates/gpu/common/convert.h"
24 #include "tensorflow/lite/delegates/gpu/common/operations.h"
25 #include "tensorflow/lite/delegates/gpu/common/shape.h"
26 #include "tensorflow/lite/delegates/gpu/common/status.h"
27 #include "tensorflow/lite/delegates/gpu/common/types.h"
28 #include "tensorflow/lite/delegates/gpu/common/util.h"
29 #include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
30 #include "tensorflow/lite/delegates/gpu/gl/variable.h"
31 #include "tensorflow/lite/delegates/gpu/gl/workgroups/ideal_workgroup_picker.h"
32
33 namespace tflite {
34 namespace gpu {
35 namespace gl {
36 namespace {
37
38 class Convolution : public NodeShader {
39 public:
GenerateCode(const GenerationContext & ctx,GeneratedCode * generated_code) const40 absl::Status GenerateCode(const GenerationContext& ctx,
41 GeneratedCode* generated_code) const final {
42 if (ctx.input_shapes.size() != 1) {
43 return absl::UnimplementedError(
44 "Convolution does not support more than 1 runtime tensor");
45 }
46 const auto& attr =
47 absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
48 auto weights = attr.weights.shape;
49 const int offsets_count = weights.h * weights.w;
50 const bool offsets_count_too_large = offsets_count > kMaxConstArraySize;
51 std::vector<Variable> parameters;
52 if (offsets_count_too_large) {
53 parameters = {
54 {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
55 {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
56 {"padding_w", attr.padding.prepended.w},
57 {"padding_h", attr.padding.prepended.h},
58 {"dilation_w", attr.dilations.w},
59 {"dilation_h", attr.dilations.h},
60 {"kernel_w", weights.w},
61 {"kernel_h", weights.h},
62 {"src_depth", DivideRoundUp(weights.i, 4)},
63 {"stride", int2(attr.strides.w, attr.strides.h)},
64 };
65 } else {
66 std::vector<int2> offsets;
67 for (int h = 0; h < weights.h; ++h) {
68 for (int w = 0; w < weights.w; ++w) {
69 offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w,
70 h * attr.dilations.h - attr.padding.prepended.h);
71 }
72 }
73 parameters = {
74 {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
75 {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
76 {"offsets_count", offsets_count},
77 {"offsets", offsets},
78 {"src_depth", DivideRoundUp(weights.i, 4)},
79 {"stride", int2(attr.strides.w, attr.strides.h)},
80 };
81 }
82
83 // at least one padding is not empty
84 bool non_empty_padding =
85 attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
86 attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0;
87
88 std::vector<std::pair<std::string, Object>> objects = {
89 {"weights", MakeReadonlyObject(Get3DSizeForPHWO4I4(attr.weights.shape),
90 ConvertToPHWO4I4(attr.weights))}};
91
92 std::string source;
93 if (offsets_count_too_large) {
94 source = R"(
95 int i = 0;
96 for (int ky = 0; ky < $kernel_h$; ky++) {
97 for (int kx = 0; kx < $kernel_w$; kx++, i++) {
98 ivec2 coord = gid.xy * $stride$ + ivec2(kx * $dilation_w$ - $padding_w$, ky * $dilation_h$ - $padding_h$);)";
99 } else {
100 source = R"(
101 for (int i = 0; i < $offsets_count$; ++i) {
102 ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)";
103 }
104 if (non_empty_padding) {
105 source += R"(
106 if (coord.x < 0 || coord.y < 0 || coord.x >= $input_data_0_w$ || coord.y >= $input_data_0_h$) {
107 continue;
108 })";
109 }
110 source += R"(
111 for (int l = 0; l < $src_depth$; ++l) {
112 vec4 input_ = $input_data_0[coord.x, coord.y, l]$;
113 value_0.x += dot(input_, $weights[l * 4 + 0, i, gid.z]$);
114 value_0.y += dot(input_, $weights[l * 4 + 1, i, gid.z]$);
115 value_0.z += dot(input_, $weights[l * 4 + 2, i, gid.z]$);
116 value_0.w += dot(input_, $weights[l * 4 + 3, i, gid.z]$);
117 }
118 }
119 )";
120 if (offsets_count_too_large) {
121 source += R"(
122 }
123 )";
124 }
125 if (!attr.bias.data.empty()) {
126 source += "value_0 += $bias[gid.z]$;\n";
127 objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
128 }
129
130 *generated_code = {
131 /*parameters=*/std::move(parameters),
132 /*objects=*/std::move(objects),
133 /*shared_variables=*/{},
134 /*workload=*/uint3(),
135 /*workgroup=*/
136 GetIdealWorkgroupIfPossible(
137 *ctx.gpu_info, OperationType::CONVOLUTION_2D,
138 HW(weights.h, weights.w), attr.strides, uint3(0, 0, 0),
139 OHWI(weights.o, ctx.input_shapes[0][1], ctx.input_shapes[0][2],
140 ctx.input_shapes[0][3])),
141 /*source_code=*/std::move(source),
142 /*input=*/IOStructure::ONLY_DEFINITIONS,
143 /*output=*/IOStructure::AUTO,
144 };
145 return absl::OkStatus();
146 }
147 };
148
SelectMultiplier(int32_t input_width,const NodeShader::GenerationContext & ctx)149 int SelectMultiplier(int32_t input_width,
150 const NodeShader::GenerationContext& ctx) {
151 std::vector<int> multipliers = {4, 2};
152 if (!ctx.compiler_options.allow_precision_loss && ctx.gpu_info->IsMali()) {
153 multipliers = {2};
154 }
155 for (int i : multipliers) {
156 if (input_width % i == 0) {
157 return i;
158 }
159 }
160 return 1;
161 }
162
163 class Convolution1x1 : public NodeShader {
164 public:
GenerateCode(const GenerationContext & ctx,GeneratedCode * generated_code) const165 absl::Status GenerateCode(const GenerationContext& ctx,
166 GeneratedCode* generated_code) const final {
167 if (ctx.input_shapes.size() != 1) {
168 return absl::UnimplementedError(
169 "Convolution does not support more than 1 runtime tensor");
170 }
171 const auto& attr =
172 absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
173 if (attr.weights.shape.h != 1 || attr.weights.shape.w != 1) {
174 return absl::UnimplementedError("Height and width should be 1.");
175 }
176 if (attr.dilations.h != 1 || attr.dilations.w != 1) {
177 return absl::UnimplementedError("Dilations are not supported.");
178 }
179 if (attr.strides.h != 1 || attr.strides.w != 1) {
180 return absl::UnimplementedError("Strides are not supported.");
181 }
182 if (attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
183 attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0) {
184 return absl::UnimplementedError("Padding is not supported.");
185 }
186
187 int multiplier = SelectMultiplier(ctx.input_shapes[0][2], ctx);
188
189 std::vector<Variable> parameters = {
190 {"src_depth",
191 DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
192 };
193
194 std::vector<std::pair<std::string, Object>> objects = {
195 {"weights",
196 MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
197 DivideRoundUp(attr.weights.shape.o, 4)),
198 ConvertToPHWO4I4(attr.weights))}};
199 std::string source;
200 for (int i = 0; i < multiplier; i++) {
201 absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
202 }
203 absl::StrAppend(&source, "vec4 f;\n");
204 absl::StrAppend(&source, "for (int l = 0; l < $src_depth$; ++l) {\n");
205 for (int i = 0; i < multiplier; i++) {
206 absl::StrAppend(&source, " vec4 input", i, " = $input_data_0[gid.x * ",
207 multiplier, " + ", i, ",gid.y,l]$;\n");
208 }
209 for (int k = 0; k < 4; k++) {
210 absl::StrAppend(&source, " f = $weights[", k, ", l, gid.z]$;\n");
211 for (int i = 0; i < multiplier; i++) {
212 absl::StrAppend(&source, " result", i, "[", k, "] += dot(input", i,
213 ", f);\n");
214 }
215 }
216 absl::StrAppend(&source, "}\n");
217 if (!attr.bias.data.empty()) {
218 objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
219 absl::StrAppend(&source, "vec4 b = $bias[gid.z]$;\n");
220 for (int i = 0; i < multiplier; i++) {
221 absl::StrAppend(&source, "result", i, " += b;\n");
222 }
223 }
224 if (multiplier != 1) {
225 for (int i = 0; i < multiplier; i++) {
226 absl::StrAppend(&source, "$inplace_update:result", i, "$\n");
227 absl::StrAppend(&source, "$output_data_0[gid.x * ", multiplier, " + ",
228 i, ",gid.y,gid.z] = result", i, "$;\n");
229 }
230 } else {
231 absl::StrAppend(&source, "value_0 = result0;\n");
232 }
233
234 auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
235 uint3 workgroup = uint3(16, 16, 1);
236 if (ctx.gpu_info->IsAdreno()) {
237 if (dst_depth >= 2) {
238 workgroup = uint3(8, 8, 2);
239 }
240 if (dst_depth >= 4) {
241 workgroup = uint3(4, 8, 4);
242 }
243 if (dst_depth >= 8) {
244 workgroup = uint3(4, 4, 8);
245 }
246 if (dst_depth >= 32) {
247 workgroup = uint3(4, 4, 16);
248 }
249 if (dst_depth >= 64) {
250 workgroup = uint3(2, 8, 16);
251 }
252 } else {
253 if (dst_depth >= 2) {
254 workgroup = uint3(16, 8, 2);
255 }
256 if (dst_depth >= 4) {
257 workgroup = uint3(16, 4, 4);
258 }
259 if (dst_depth >= 8) {
260 workgroup = uint3(8, 4, 8);
261 }
262 if (dst_depth >= 32) {
263 workgroup = uint3(8, 4, 8);
264 }
265 if (dst_depth >= 64) {
266 workgroup = uint3(8, 4, 8);
267 }
268 }
269 *generated_code = {
270 /*parameters=*/std::move(parameters),
271 /*objects=*/std::move(objects),
272 /*shared_variables=*/{},
273 /*workload=*/
274 uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
275 DivideRoundUp(ctx.output_shapes[0][3], 4)),
276 /*workgroup=*/
277 GetIdealWorkgroupIfPossible(
278 *ctx.gpu_info, OperationType::CONVOLUTION_2D,
279 HW(attr.weights.shape.h, attr.weights.shape.w), attr.strides,
280 workgroup,
281 OHWI(attr.weights.shape.o, ctx.input_shapes[0][1],
282 ctx.input_shapes[0][2], ctx.input_shapes[0][3])),
283 /*source_code=*/std::move(source),
284 /*input=*/IOStructure::ONLY_DEFINITIONS,
285 /*output=*/multiplier == 1 ? IOStructure::AUTO
286 : IOStructure::ONLY_DEFINITIONS,
287 };
288 return absl::OkStatus();
289 }
290 };
291
292 } // namespace
293
NewConvolutionNodeShader()294 std::unique_ptr<NodeShader> NewConvolutionNodeShader() {
295 return absl::make_unique<Convolution>();
296 }
297
NewConvolution1x1NodeShader()298 std::unique_ptr<NodeShader> NewConvolution1x1NodeShader() {
299 return absl::make_unique<Convolution1x1>();
300 }
301
302 } // namespace gl
303 } // namespace gpu
304 } // namespace tflite
305