• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/gl/kernels/conv.h"
17 
18 #include <memory>
19 #include <vector>
20 
21 #include "absl/memory/memory.h"
22 #include "absl/strings/str_cat.h"
23 #include "tensorflow/lite/delegates/gpu/common/convert.h"
24 #include "tensorflow/lite/delegates/gpu/common/operations.h"
25 #include "tensorflow/lite/delegates/gpu/common/shape.h"
26 #include "tensorflow/lite/delegates/gpu/common/status.h"
27 #include "tensorflow/lite/delegates/gpu/common/types.h"
28 #include "tensorflow/lite/delegates/gpu/common/util.h"
29 #include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
30 #include "tensorflow/lite/delegates/gpu/gl/variable.h"
31 #include "tensorflow/lite/delegates/gpu/gl/workgroups/ideal_workgroup_picker.h"
32 
33 namespace tflite {
34 namespace gpu {
35 namespace gl {
36 namespace {
37 
38 class Convolution : public NodeShader {
39  public:
GenerateCode(const GenerationContext & ctx,GeneratedCode * generated_code) const40   absl::Status GenerateCode(const GenerationContext& ctx,
41                             GeneratedCode* generated_code) const final {
42     if (ctx.input_shapes.size() != 1) {
43       return absl::UnimplementedError(
44           "Convolution does not support more than 1 runtime tensor");
45     }
46     const auto& attr =
47         absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
48     auto weights = attr.weights.shape;
49     const int offsets_count = weights.h * weights.w;
50     const bool offsets_count_too_large = offsets_count > kMaxConstArraySize;
51     std::vector<Variable> parameters;
52     if (offsets_count_too_large) {
53       parameters = {
54           {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
55           {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
56           {"padding_w", attr.padding.prepended.w},
57           {"padding_h", attr.padding.prepended.h},
58           {"dilation_w", attr.dilations.w},
59           {"dilation_h", attr.dilations.h},
60           {"kernel_w", weights.w},
61           {"kernel_h", weights.h},
62           {"src_depth", DivideRoundUp(weights.i, 4)},
63           {"stride", int2(attr.strides.w, attr.strides.h)},
64       };
65     } else {
66       std::vector<int2> offsets;
67       for (int h = 0; h < weights.h; ++h) {
68         for (int w = 0; w < weights.w; ++w) {
69           offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w,
70                                h * attr.dilations.h - attr.padding.prepended.h);
71         }
72       }
73       parameters = {
74           {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
75           {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])},
76           {"offsets_count", offsets_count},
77           {"offsets", offsets},
78           {"src_depth", DivideRoundUp(weights.i, 4)},
79           {"stride", int2(attr.strides.w, attr.strides.h)},
80       };
81     }
82 
83     // at least one padding is not empty
84     bool non_empty_padding =
85         attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
86         attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0;
87 
88     std::vector<std::pair<std::string, Object>> objects = {
89         {"weights", MakeReadonlyObject(Get3DSizeForPHWO4I4(attr.weights.shape),
90                                        ConvertToPHWO4I4(attr.weights))}};
91 
92     std::string source;
93     if (offsets_count_too_large) {
94       source = R"(
95       int i = 0;
96       for (int ky = 0; ky < $kernel_h$; ky++) {
97         for (int kx = 0; kx < $kernel_w$; kx++, i++) {
98           ivec2 coord = gid.xy * $stride$ + ivec2(kx * $dilation_w$ - $padding_w$, ky * $dilation_h$ - $padding_h$);)";
99     } else {
100       source = R"(
101         for (int i = 0; i < $offsets_count$; ++i) {
102           ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)";
103     }
104     if (non_empty_padding) {
105       source += R"(
106         if (coord.x < 0 || coord.y < 0 || coord.x >= $input_data_0_w$ || coord.y >= $input_data_0_h$) {
107           continue;
108         })";
109     }
110     source += R"(
111           for (int l = 0; l < $src_depth$; ++l) {
112             vec4 input_ = $input_data_0[coord.x, coord.y, l]$;
113             value_0.x += dot(input_, $weights[l * 4 + 0, i, gid.z]$);
114             value_0.y += dot(input_, $weights[l * 4 + 1, i, gid.z]$);
115             value_0.z += dot(input_, $weights[l * 4 + 2, i, gid.z]$);
116             value_0.w += dot(input_, $weights[l * 4 + 3, i, gid.z]$);
117           }
118         }
119 )";
120     if (offsets_count_too_large) {
121       source += R"(
122       }
123 )";
124     }
125     if (!attr.bias.data.empty()) {
126       source += "value_0 += $bias[gid.z]$;\n";
127       objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
128     }
129 
130     *generated_code = {
131         /*parameters=*/std::move(parameters),
132         /*objects=*/std::move(objects),
133         /*shared_variables=*/{},
134         /*workload=*/uint3(),
135         /*workgroup=*/
136         GetIdealWorkgroupIfPossible(
137             *ctx.gpu_info, OperationType::CONVOLUTION_2D,
138             HW(weights.h, weights.w), attr.strides, uint3(0, 0, 0),
139             OHWI(weights.o, ctx.input_shapes[0][1], ctx.input_shapes[0][2],
140                  ctx.input_shapes[0][3])),
141         /*source_code=*/std::move(source),
142         /*input=*/IOStructure::ONLY_DEFINITIONS,
143         /*output=*/IOStructure::AUTO,
144     };
145     return absl::OkStatus();
146   }
147 };
148 
SelectMultiplier(int32_t input_width,const NodeShader::GenerationContext & ctx)149 int SelectMultiplier(int32_t input_width,
150                      const NodeShader::GenerationContext& ctx) {
151   std::vector<int> multipliers = {4, 2};
152   if (!ctx.compiler_options.allow_precision_loss && ctx.gpu_info->IsMali()) {
153     multipliers = {2};
154   }
155   for (int i : multipliers) {
156     if (input_width % i == 0) {
157       return i;
158     }
159   }
160   return 1;
161 }
162 
163 class Convolution1x1 : public NodeShader {
164  public:
GenerateCode(const GenerationContext & ctx,GeneratedCode * generated_code) const165   absl::Status GenerateCode(const GenerationContext& ctx,
166                             GeneratedCode* generated_code) const final {
167     if (ctx.input_shapes.size() != 1) {
168       return absl::UnimplementedError(
169           "Convolution does not support more than 1 runtime tensor");
170     }
171     const auto& attr =
172         absl::any_cast<const Convolution2DAttributes&>(ctx.op_attr);
173     if (attr.weights.shape.h != 1 || attr.weights.shape.w != 1) {
174       return absl::UnimplementedError("Height and width should be 1.");
175     }
176     if (attr.dilations.h != 1 || attr.dilations.w != 1) {
177       return absl::UnimplementedError("Dilations are not supported.");
178     }
179     if (attr.strides.h != 1 || attr.strides.w != 1) {
180       return absl::UnimplementedError("Strides are not supported.");
181     }
182     if (attr.padding.appended.h != 0 || attr.padding.appended.w != 0 ||
183         attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0) {
184       return absl::UnimplementedError("Padding is not supported.");
185     }
186 
187     int multiplier = SelectMultiplier(ctx.input_shapes[0][2], ctx);
188 
189     std::vector<Variable> parameters = {
190         {"src_depth",
191          DivideRoundUp(static_cast<int>(ctx.input_shapes[0][3]), 4)},
192     };
193 
194     std::vector<std::pair<std::string, Object>> objects = {
195         {"weights",
196          MakeReadonlyObject(uint3(4, DivideRoundUp(attr.weights.shape.i, 4),
197                                   DivideRoundUp(attr.weights.shape.o, 4)),
198                             ConvertToPHWO4I4(attr.weights))}};
199     std::string source;
200     for (int i = 0; i < multiplier; i++) {
201       absl::StrAppend(&source, "highp vec4 result", i, " = vec4(0);\n");
202     }
203     absl::StrAppend(&source, "vec4 f;\n");
204     absl::StrAppend(&source, "for (int l = 0; l < $src_depth$; ++l) {\n");
205     for (int i = 0; i < multiplier; i++) {
206       absl::StrAppend(&source, "  vec4 input", i, " = $input_data_0[gid.x * ",
207                       multiplier, " + ", i, ",gid.y,l]$;\n");
208     }
209     for (int k = 0; k < 4; k++) {
210       absl::StrAppend(&source, "  f = $weights[", k, ", l, gid.z]$;\n");
211       for (int i = 0; i < multiplier; i++) {
212         absl::StrAppend(&source, "  result", i, "[", k, "] += dot(input", i,
213                         ", f);\n");
214       }
215     }
216     absl::StrAppend(&source, "}\n");
217     if (!attr.bias.data.empty()) {
218       objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)});
219       absl::StrAppend(&source, "vec4 b = $bias[gid.z]$;\n");
220       for (int i = 0; i < multiplier; i++) {
221         absl::StrAppend(&source, "result", i, " += b;\n");
222       }
223     }
224     if (multiplier != 1) {
225       for (int i = 0; i < multiplier; i++) {
226         absl::StrAppend(&source, "$inplace_update:result", i, "$\n");
227         absl::StrAppend(&source, "$output_data_0[gid.x * ", multiplier, " + ",
228                         i, ",gid.y,gid.z] = result", i, "$;\n");
229       }
230     } else {
231       absl::StrAppend(&source, "value_0 = result0;\n");
232     }
233 
234     auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
235     uint3 workgroup = uint3(16, 16, 1);
236     if (ctx.gpu_info->IsAdreno()) {
237       if (dst_depth >= 2) {
238         workgroup = uint3(8, 8, 2);
239       }
240       if (dst_depth >= 4) {
241         workgroup = uint3(4, 8, 4);
242       }
243       if (dst_depth >= 8) {
244         workgroup = uint3(4, 4, 8);
245       }
246       if (dst_depth >= 32) {
247         workgroup = uint3(4, 4, 16);
248       }
249       if (dst_depth >= 64) {
250         workgroup = uint3(2, 8, 16);
251       }
252     } else {
253       if (dst_depth >= 2) {
254         workgroup = uint3(16, 8, 2);
255       }
256       if (dst_depth >= 4) {
257         workgroup = uint3(16, 4, 4);
258       }
259       if (dst_depth >= 8) {
260         workgroup = uint3(8, 4, 8);
261       }
262       if (dst_depth >= 32) {
263         workgroup = uint3(8, 4, 8);
264       }
265       if (dst_depth >= 64) {
266         workgroup = uint3(8, 4, 8);
267       }
268     }
269     *generated_code = {
270         /*parameters=*/std::move(parameters),
271         /*objects=*/std::move(objects),
272         /*shared_variables=*/{},
273         /*workload=*/
274         uint3(ctx.output_shapes[0][2] / multiplier, ctx.output_shapes[0][1],
275               DivideRoundUp(ctx.output_shapes[0][3], 4)),
276         /*workgroup=*/
277         GetIdealWorkgroupIfPossible(
278             *ctx.gpu_info, OperationType::CONVOLUTION_2D,
279             HW(attr.weights.shape.h, attr.weights.shape.w), attr.strides,
280             workgroup,
281             OHWI(attr.weights.shape.o, ctx.input_shapes[0][1],
282                  ctx.input_shapes[0][2], ctx.input_shapes[0][3])),
283         /*source_code=*/std::move(source),
284         /*input=*/IOStructure::ONLY_DEFINITIONS,
285         /*output=*/multiplier == 1 ? IOStructure::AUTO
286                                    : IOStructure::ONLY_DEFINITIONS,
287     };
288     return absl::OkStatus();
289   }
290 };
291 
292 }  // namespace
293 
NewConvolutionNodeShader()294 std::unique_ptr<NodeShader> NewConvolutionNodeShader() {
295   return absl::make_unique<Convolution>();
296 }
297 
NewConvolution1x1NodeShader()298 std::unique_ptr<NodeShader> NewConvolution1x1NodeShader() {
299   return absl::make_unique<Convolution1x1>();
300 }
301 
302 }  // namespace gl
303 }  // namespace gpu
304 }  // namespace tflite
305