/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.h"

#include <algorithm>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {
namespace {
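// Packs all parameters of the fused depthwise + 1x1 convolution into a single
// constant buffer read by the generated kernel. Layout, with every channel
// count aligned to 4:
//   [dw biases][dw weights, H x W x C][conv biases][conv 1x1 weights, O x I].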
void UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
                   const Convolution2DAttributes& conv_attr,
                   CalculationsPrecision precision, GPUOperation* op) {
  int dw_dst_ch_aligned = AlignByN(dw_attr.weights.shape.i, 4);
  int dw_weights_count =
      dw_dst_ch_aligned * dw_attr.weights.shape.h * dw_attr.weights.shape.w;
  int conv_src_ch_aligned = AlignByN(conv_attr.weights.shape.i, 4);
  int conv_dst_ch_aligned = AlignByN(conv_attr.weights.shape.o, 4);
  int conv_weights_count = conv_src_ch_aligned * conv_dst_ch_aligned;
  std::vector<float> gpu_data;
  gpu_data.reserve(dw_dst_ch_aligned + dw_weights_count + conv_dst_ch_aligned +
                   conv_weights_count);
  // dw bias loading
  for (int i = 0; i < dw_dst_ch_aligned; ++i) {
    if (i < dw_attr.bias.shape.v) {
      gpu_data.push_back(dw_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // dw weights loading
  for (int y = 0; y < dw_attr.weights.shape.h; ++y) {
    for (int x = 0; x < dw_attr.weights.shape.w; ++x) {
      for (int d = 0; d < dw_dst_ch_aligned / 4; ++d) {
        for (int i = 0; i < 4; ++i) {
          const int d_ch = d * 4 + i;
          if (d_ch < dw_attr.weights.shape.i) {
            const int f_index =
                dw_attr.weights.shape.LinearIndex({0, y, x, d_ch});
            gpu_data.push_back(dw_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }
  // conv bias loading
  for (int i = 0; i < conv_dst_ch_aligned; ++i) {
    if (i < conv_attr.bias.shape.v) {
      gpu_data.push_back(conv_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // conv weights loading
  for (int d = 0; d < conv_dst_ch_aligned / 4; ++d) {
    for (int s = 0; s < conv_src_ch_aligned / 4; ++s) {
      for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + j;
          const int d_ch = d * 4 + i;
          if (s_ch < conv_attr.weights.shape.i &&
              d_ch < conv_attr.weights.shape.o) {
            const int f_index =
                conv_attr.weights.shape.LinearIndex({d_ch, 0, 0, s_ch});
            gpu_data.push_back(conv_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }

  const bool fp32_weights = precision == CalculationsPrecision::F32;
  const int float_size = fp32_weights ? 4 : 2;
  BufferDescriptor desc;
  desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.element_size = 4;
  desc.memory_type = MemoryType::CONSTANT;
  desc.size = float_size * gpu_data.size();
  desc.data.resize(desc.size);

  if (fp32_weights) {
    memcpy(desc.data.data(), gpu_data.data(), desc.size);
  } else {
    half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
    for (int i = 0; i < gpu_data.size(); ++i) {
      gpu_data_half[i] = gpu_data[i];
    }
  }
  op->args_.AddObject("constants",
                      absl::make_unique<BufferDescriptor>(std::move(desc)));
}

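// Emits the fused kernel: each work item computes one (X, Y) position,
// accumulating the depthwise convolution into per-slice registers (dw_res_*)
// and then applying the 1x1 convolution to those registers to produce the
// output slices (conv_res_*), all in a single dispatch.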
std::string GenerateCode(const OperationDef& op_def,
                         const DepthwiseConvolution2DAttributes& dw_attr,
                         int result_depth, GPUOperation* result) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  result->AddSrcTensor("src_tensor", src_desc);
  result->AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  result->args_.AddInt("stride_x", dw_attr.strides.w);
  result->args_.AddInt("padding_x", -dw_attr.padding.prepended.w);
  result->args_.AddInt("dilation_x", dw_attr.dilations.w);
  result->args_.AddInt("stride_y", dw_attr.strides.h);
  result->args_.AddInt("padding_y", -dw_attr.padding.prepended.h);
  result->args_.AddInt("dilation_y", dw_attr.dilations.h);

  std::string c;
  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = GLOBAL_ID_0;\n";
  }
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) { "
       "\n";
  c += "    return; \n";
  c += "  } \n";
  c += "  __constant FLT4* constants = args.constants.GetPtr();\n";
  int intermediate_depth = DivideRoundUp(dw_attr.weights.shape.i, 4);
  int weights_counter = 0;
  for (int d = 0; d < intermediate_depth; ++d) {
    c += "  FLT4 dw_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
  c += "  int x_offseted = X * args.stride_x + args.padding_x;\n";
  c += "  int y_offseted = Y * args.stride_y + args.padding_y;\n";
  c += "  int x_c, y_c;\n";

  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_in", "y_in", "z_in"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
    c += "  bool y_in;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
    c += "  bool x_in;\n";
  }

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
  c += "  FLT4 src;\n";
  for (int ky = 0; ky < dw_attr.weights.shape.h; ++ky) {
    c += "  y_c = y_offseted + " + std::to_string(ky) + " * args.dilation_y;\n";
    if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
      c += "  y_in = y_c >= 0 && y_c < args.src_tensor.Height();\n";
      c += "  y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
    }
    for (int kx = 0; kx < dw_attr.weights.shape.w; ++kx) {
      c += "  x_c = x_offseted + " + std::to_string(kx) +
           " * args.dilation_x;\n";
      if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
        c += "  x_in = x_c >= 0 && x_c < args.src_tensor.Width();\n";
        c += "  x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
      }
      for (int d = 0; d < intermediate_depth; ++d) {
        const int src_ch_count = std::min(4, dw_attr.weights.shape.i - d * 4);
        const std::string s_postfix = postfixes[src_ch_count - 1];
        std::string multiplier =
            check.empty() ? "" : " * INIT_FLT(" + check + ")";
        c += "  src" + s_postfix + " = args.src_tensor.Read(x_c, y_c, " +
             std::to_string(d) + ")" + s_postfix + multiplier + ";\n";
        c += "  dw_res_" + std::to_string(d) + s_postfix + " += src" +
             s_postfix + " * constants[" + std::to_string(weights_counter++) +
             "]" + s_postfix + ";\n";
      }
    }
  }
  for (int d = 0; d < result_depth; ++d) {
    c += "  FLT4 conv_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
  for (int d = 0; d < result_depth; ++d) {
    for (int s = 0; s < intermediate_depth; ++s) {
      std::string src = "dw_res_" + std::to_string(s);
      std::string dst = "conv_res_" + std::to_string(d);
      c += "  " + dst + " += " + src + ".x * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".y * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".z * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".w * constants[" +
           std::to_string(weights_counter++) + "];\n";
    }
    c += "  args.dst_tensor.Write(conv_res_" + std::to_string(d) + ", X, Y, " +
         std::to_string(d) + ");\n";
  }
  c += "}\n";

  return c;
}

}  // namespace

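// The fusion targets a depthwise convolution with channel multiplier 1
// followed by a plain 1x1 convolution (unit strides and dilations, no
// padding). The "recommended" limits are heuristics that keep the packed
// constant buffer small.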
bool IsDepthwiseConvPlus1x1ConvSupported(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  const auto dw_shape = dw_attr.weights.shape;
  const auto conv_shape = conv_attr.weights.shape;
  bool good_dw = dw_shape.o == 1;
  bool good_conv =
      conv_shape.w == 1 && conv_shape.h == 1 && conv_attr.dilations.w == 1 &&
      conv_attr.dilations.h == 1 && conv_attr.strides.w == 1 &&
      conv_attr.strides.h == 1 && conv_attr.padding.prepended.w == 0 &&
      conv_attr.padding.prepended.h == 0 && conv_attr.padding.appended.w == 0 &&
      conv_attr.padding.appended.h == 0;
  bool recommended_dw =
      dw_shape.i <= 16 && dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
  bool recommended_conv =
      conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 16 * 32;
  return good_dw && good_conv && recommended_dw && recommended_conv;
}

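// Builds the fused operation: generates kernel code for
// DivideRoundUp(conv output channels, 4) result slices and uploads the packed
// constants.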
GPUOperation CreateDepthwiseConvPlus1x1Conv(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  GPUOperation result(definition);
  result.code_ =
      GenerateCode(definition, dw_attr,
                   DivideRoundUp(conv_attr.weights.shape.o, 4), &result);
  result.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
  UploadWeights(dw_attr, conv_attr, definition.precision, &result);
  return result;
}

}  // namespace gpu
}  // namespace tflite