• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/gl/kernels/conv.h"
17 
18 #include <memory>
19 #include <vector>
20 
21 #include "absl/memory/memory.h"
22 #include "absl/strings/str_cat.h"
23 #include "tensorflow/lite/delegates/gpu/common/convert.h"
24 #include "tensorflow/lite/delegates/gpu/common/operations.h"
25 #include "tensorflow/lite/delegates/gpu/common/shape.h"
26 #include "tensorflow/lite/delegates/gpu/common/status.h"
27 #include "tensorflow/lite/delegates/gpu/common/types.h"
28 #include "tensorflow/lite/delegates/gpu/common/util.h"
29 #include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
30 #include "tensorflow/lite/delegates/gpu/gl/variable.h"
31 #include "tensorflow/lite/delegates/gpu/gl/workgroups/ideal_workgroup_picker.h"
32 
33 namespace tflite {
34 namespace gpu {
35 namespace gl {
36 namespace {
37 
38 class Convolution : public NodeShader {
39  public:
GenerateCode(const GenerationContext & ctx,GeneratedCode * generated_code) const40   absl::Status GenerateCode(const GenerationContext& ctx,
41                             GeneratedCode* generated_code) const final {
42     if (ctx.input_shapes.size() != 1) {
43       return absl::UnimplementedError(
44           "Convolution does not support more than 1 runtime tensor");
45     }
46     const auto& attr =
47         absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
48     auto weights = attr.weights.shape;
49     const int offsets_count = weights.h * weights.w;
50     const bool offsets_count_too_large = offsets_count > kMaxConstArraySize;
51     std::vector<Variable> parameters;
52     if (offsets_count_too_large) {
53       parameters = {
54           {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
55           {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
56           {"padding_w", attr.padding.prepended.w},
57           {"padding_h", attr.padding.prepended.h},
58           {"dilation_w", attr.dilations.w},
59           {"dilation_h", attr.dilations.h},
60           {"kernel_w", weights.w},
61           {"kernel_h", weights.h},
62           {"src_depth", DivideRoundUp(weights.i, 4)},
63           {"stride", int2(attr.strides.w, attr.strides.h)},
64       };
65     } else {
66       std::vector<int2> offsets;
67       for (int h = 0; h < weights.h; ++h) {
68         for (int w = 0; w < weights.w; ++w) {
69           offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w,
70                                h * attr.dilations.h - attr.padding.prepended.h);
71         }
72       }
73       parameters = {
74           {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
75           {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
76           {"offsets_count", offsets_count},
77           {"offsets", offsets},
78           {"src_depth", DivideRoundUp(weights.i, 4)},
79           {"stride", int2(attr.strides.w, attr.strides.h)},
80       };
81     }
82 
83     // at least one padding is not empty
84     bool non_empty_padding =
85         attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
86         attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0;
87 
88     std::vector<std::pair<std::string, Object>> objects = {
89         {"weights", MakeReadonlyObject(Get3DSizeForPHWO4I4(attr.weights.shape),
90                                        ConvertToPHWO4I4(attr.weights))}};
91 
92     std::string source;
93     if (offsets_count_too_large) {
94       source = R"(
95       int i = 0;
96       for (int ky = 0; ky < $kernel_h$; ky++) {
97         for (int kx = 0; kx < $kernel_w$; kx++, i++) {
98           ivec2 coord = gid.xy * $stride$ + ivec2(kx * $dilation_w$ - $padding_w$, ky * $dilation_h$ - $padding_h$);)";
99     } else {
100       source = R"(
101         for (int i = 0; i < $offsets_count$; ++i) {
102           ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)";
103     }
104     if (non_empty_padding) {
105       source += R"(
106         if (coord.x < 0 || coord.y < 0 || coord.x >= $input_data_0_w$ || coord.y >= $input_data_0_h$) {
107           continue;
108         })";
109     }
110     source += R"(
111           for (int l = 0; l < $src_depth$; ++l) {
112             vec4 input_ = $input_data_0[coord.x, coord.y, l]$;
113             value_0.x += dot(input_, $weights[l * 4 + 0, i, gid.z]$);
114             value_0.y += dot(input_, $weights[l * 4 + 1, i, gid.z]$);
115             value_0.z += dot(input_, $weights[l * 4 + 2, i, gid.z]$);
116             value_0.w += dot(input_, $weights[l * 4 + 3, i, gid.z]$);
117           }
118         }
119 )";
120     if (offsets_count_too_large) {
121       source += R"(
122       }
123 )";
124     }
125     if (!attr.bias.data.empty()) {
126       source += "value_0 += $bias[gid.z]$;\n";
127       objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
128     }
129 
130     *generated_code = {
131         /*parameters=*/std::move(parameters),
132         /*objects=*/std::move(objects),
133         /*shared_variables=*/{},
134         /*workload=*/uint3(),
135         /*workgroup=*/
136         GetIdealWorkgroupIfPossible(
137             *ctx.gpu_info, OperationType::CONVOLUTION_2D,
138             HW(weights.h, weights.w), attr.strides, uint3(0, 0, 0),
139             OHWI(weights.o, ctx.input_shapes[0][1], ctx.input_shapes[0][2],
140                  ctx.input_shapes[0][3])),
141         /*source_code=*/std::move(source),
142         /*input=*/IOStructure::ONLY_DEFINITIONS,
143         /*output=*/IOStructure::AUTO,
144     };
145     return absl::OkStatus();
146   }
147 };
148 
SelectMultiplier(int32_t input_width,const NodeShader::GenerationContext & ctx)149 int SelectMultiplier(int32_t input_width,
150                      const NodeShader::GenerationContext& ctx) {
151   std::vector<int> multipliers = {4, 2};
152   if (ctx.gpu_info->IsAMD()) {
153     return 1;
154   }
155   if (!ctx.compiler_options.allow_precision_loss && ctx.gpu_info->IsMali()) {
156     multipliers = {2};
157   }
158   for (int i : multipliers) {
159     if (input_width % i == 0) {
160       return i;
161     }
162   }
163   return 1;
164 }
165 
166 class Convolution1x1 : public NodeShader {
167  public:
GenerateCode(const GenerationContext & ctx,GeneratedCode * generated_code) const168   absl::Status GenerateCode(const GenerationContext& ctx,
169                             GeneratedCode* generated_code) const final {
170     if (ctx.input_shapes.size() != 1) {
171       return absl::UnimplementedError(
172           "Convolution does not support more than 1 runtime tensor");
173     }
174     const auto& attr =
175         absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
176     if (attr.weights.shape.h != 1 || attr.weights.shape.w != 1) {
177       return absl::UnimplementedError("Height and width should be 1.");
178     }
179     if (attr.dilations.h != 1 || attr.dilations.w != 1) {
180       return absl::UnimplementedError("Dilations are not supported.");
181     }
182     if (attr.strides.h != 1 || attr.strides.w != 1) {
183       return absl::UnimplementedError("Strides are not supported.");
184     }
185     if (attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
186         attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0) {
187       return absl::UnimplementedError("Padding is not supported.");
188     }
189 
190     int multiplier = SelectMultiplier(ctx.input_shapes[0][2], ctx);
191 
192     std::vector<Variable> parameters = {
193         {"src_depth",
194          DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
195     };
196 
197     std::vector<std::pair<std::string, Object>> objects = {
198         {"weights",
199          MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
200                                   DivideRoundUp(attr.weights.shape.o, 4)),
201                             ConvertToPHWO4I4(attr.weights))}};
202     std::string source;
203     for (int i = 0; i < multiplier; i++) {
204       absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
205     }
206     absl::StrAppend(&source, "vec4 f;\n");
207     absl::StrAppend(&source, "for (int l = 0; l < $src_depth$; ++l) {\n");
208     for (int i = 0; i < multiplier; i++) {
209       absl::StrAppend(&source, "  vec4 input", i, " = $input_data_0[gid.x * ",
210                       multiplier, " + ", i, ",gid.y,l]$;\n");
211     }
212     for (int k = 0; k < 4; k++) {
213       absl::StrAppend(&source, "  f = $weights[", k, ", l, gid.z]$;\n");
214       for (int i = 0; i < multiplier; i++) {
215         absl::StrAppend(&source, "  result", i, "[", k, "] += dot(input", i,
216                         ", f);\n");
217       }
218     }
219     absl::StrAppend(&source, "}\n");
220     if (!attr.bias.data.empty()) {
221       objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
222       absl::StrAppend(&source, "vec4 b = $bias[gid.z]$;\n");
223       for (int i = 0; i < multiplier; i++) {
224         absl::StrAppend(&source, "result", i, " += b;\n");
225       }
226     }
227     if (multiplier != 1) {
228       for (int i = 0; i < multiplier; i++) {
229         absl::StrAppend(&source, "$inplace_update:result", i, "$\n");
230         absl::StrAppend(&source, "$output_data_0[gid.x * ", multiplier, " + ",
231                         i, ",gid.y,gid.z] = result", i, "$;\n");
232       }
233     } else {
234       absl::StrAppend(&source, "value_0 = result0;\n");
235     }
236 
237     auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
238     uint3 workgroup = uint3(16, 16, 1);
239     if (ctx.gpu_info->IsAdreno()) {
240       if (dst_depth >= 2) {
241         workgroup = uint3(8, 8, 2);
242       }
243       if (dst_depth >= 4) {
244         workgroup = uint3(4, 8, 4);
245       }
246       if (dst_depth >= 8) {
247         workgroup = uint3(4, 4, 8);
248       }
249       if (dst_depth >= 32) {
250         workgroup = uint3(4, 4, 16);
251       }
252       if (dst_depth >= 64) {
253         workgroup = uint3(2, 8, 16);
254       }
255     } else {
256       if (dst_depth >= 2) {
257         workgroup = uint3(16, 8, 2);
258       }
259       if (dst_depth >= 4) {
260         workgroup = uint3(16, 4, 4);
261       }
262       if (dst_depth >= 8) {
263         workgroup = uint3(8, 4, 8);
264       }
265       if (dst_depth >= 32) {
266         workgroup = uint3(8, 4, 8);
267       }
268       if (dst_depth >= 64) {
269         workgroup = uint3(8, 4, 8);
270       }
271     }
272     *generated_code = {
273         /*parameters=*/std::move(parameters),
274         /*objects=*/std::move(objects),
275         /*shared_variables=*/{},
276         /*workload=*/
277         uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
278               DivideRoundUp(ctx.output_shapes[0][3], 4)),
279         /*workgroup=*/
280         GetIdealWorkgroupIfPossible(
281             *ctx.gpu_info, OperationType::CONVOLUTION_2D,
282             HW(attr.weights.shape.h, attr.weights.shape.w), attr.strides,
283             workgroup,
284             OHWI(attr.weights.shape.o, ctx.input_shapes[0][1],
285                  ctx.input_shapes[0][2], ctx.input_shapes[0][3])),
286         /*source_code=*/std::move(source),
287         /*input=*/IOStructure::ONLY_DEFINITIONS,
288         /*output=*/multiplier == 1 ? IOStructure::AUTO
289                                    : IOStructure::ONLY_DEFINITIONS,
290     };
291     return absl::OkStatus();
292   }
293 };
294 
295 }  // namespace
296 
NewConvolutionNodeShader()297 std::unique_ptr<NodeShader> NewConvolutionNodeShader() {
298   return absl::make_unique<Convolution>();
299 }
300 
NewConvolution1x1NodeShader()301 std::unique_ptr<NodeShader> NewConvolution1x1NodeShader() {
302   return absl::make_unique<Convolution1x1>();
303 }
304 
305 }  // namespace gl
306 }  // namespace gpu
307 }  // namespace tflite
308