/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.h"

#include <algorithm>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace {
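// Packs all fused weights into one constant buffer with the layout
// [dw biases][dw weights][1x1 conv biases][1x1 conv weights]. Channel
// counts are aligned to 4 so each entry maps onto one FLT4 vector in the
// generated kernel; padding slots are filled with zeros.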
void UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
                   const Convolution2DAttributes& conv_attr,
                   CalculationsPrecision precision, GPUOperation* op) {
  int dw_dst_ch_aligned = AlignByN(dw_attr.weights.shape.i, 4);
  int dw_weights_count =
      dw_dst_ch_aligned * dw_attr.weights.shape.h * dw_attr.weights.shape.w;
  int conv_src_ch_aligned = AlignByN(conv_attr.weights.shape.i, 4);
  int conv_dst_ch_aligned = AlignByN(conv_attr.weights.shape.o, 4);
  int conv_weights_count = conv_src_ch_aligned * conv_dst_ch_aligned;
  std::vector<float> gpu_data;
  gpu_data.reserve(dw_dst_ch_aligned + dw_weights_count + conv_dst_ch_aligned +
                   conv_weights_count);
  // dw bias loading
  for (int i = 0; i < dw_dst_ch_aligned; ++i) {
    if (i < dw_attr.bias.shape.v) {
      gpu_data.push_back(dw_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // dw weights loading
  for (int d = 0; d < dw_dst_ch_aligned / 4; ++d) {
    for (int y = 0; y < dw_attr.weights.shape.h; ++y) {
      for (int x = 0; x < dw_attr.weights.shape.w; ++x) {
        for (int i = 0; i < 4; ++i) {
          const int d_ch = d * 4 + i;
          if (d_ch < dw_attr.weights.shape.i) {
            const int f_index =
                dw_attr.weights.shape.LinearIndex({0, y, x, d_ch});
            gpu_data.push_back(dw_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }
  // conv bias loading
  for (int i = 0; i < conv_dst_ch_aligned; ++i) {
    if (i < conv_attr.bias.shape.v) {
      gpu_data.push_back(conv_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // conv weights loading
  for (int d = 0; d < conv_dst_ch_aligned / 4; ++d) {
    for (int s = 0; s < conv_src_ch_aligned / 4; ++s) {
      for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + j;
          const int d_ch = d * 4 + i;
          if (s_ch < conv_attr.weights.shape.i &&
              d_ch < conv_attr.weights.shape.o) {
            const int f_index =
                conv_attr.weights.shape.LinearIndex({d_ch, 0, 0, s_ch});
            gpu_data.push_back(conv_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }

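  // Store the packed data as float4 or half4 elements, depending on the
  // calculation precision.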
  const bool fp32_weights = precision == CalculationsPrecision::F32;
  const int float_size = fp32_weights ? 4 : 2;
  BufferDescriptor desc;
  desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.element_size = 4;
  desc.memory_type = MemoryType::CONSTANT;
  desc.size = float_size * gpu_data.size();
  desc.data.resize(desc.size);

  if (fp32_weights) {
    memcpy(desc.data.data(), gpu_data.data(), desc.size);
  } else {
    half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
    for (int i = 0; i < gpu_data.size(); ++i) {
      gpu_data_half[i] = gpu_data[i];
    }
  }
  op->args_.AddObject("constants",
                      absl::make_unique<BufferDescriptor>(std::move(desc)));
}

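// Emits the fused kernel source: each work item computes one (X, Y) output
// location, running the depthwise convolution into intermediate FLT4
// registers and then the 1x1 convolution over them, with all weights read
// sequentially from the constant buffer uploaded above.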
std::string GenerateCode(const OperationDef& op_def,
                         const DepthwiseConvolution2DAttributes& dw_attr,
                         int result_depth, GPUOperation* result) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  result->AddSrcTensor("src_tensor", src_desc);
  result->AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  result->args_.AddInt("stride_x", dw_attr.strides.w);
  result->args_.AddInt("padding_x", -dw_attr.padding.prepended.w);
  result->args_.AddInt("dilation_x", dw_attr.dilations.w);
  result->args_.AddInt("stride_y", dw_attr.strides.h);
  result->args_.AddInt("padding_y", -dw_attr.padding.prepended.h);
  result->args_.AddInt("dilation_y", dw_attr.dilations.h);

  std::string c;
  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = GLOBAL_ID_0;\n";
  }
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) { "
       "\n";
  c += "    return; \n";
  c += "  } \n";
  c += "  __constant FLT4* constants = args.constants.GetPtr();\n";
  int intermediate_depth = DivideRoundUp(dw_attr.weights.shape.i, 4);
  int weights_counter = 0;
  for (int d = 0; d < intermediate_depth; ++d) {
    c += "  FLT4 dw_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
  c += "  int x_offseted = X * args.stride_x + args.padding_x;\n";
  c += "  int y_offseted = Y * args.stride_y + args.padding_y;\n";
  c += "  int x_c, y_c;\n";

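  // For axes that the source tensor cannot zero-clamp in hardware, build a
  // boolean in-bounds predicate so out-of-range taps contribute zero.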
  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_in", "y_in", "z_in"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
    c += "  bool y_in;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
    c += "  bool x_in;\n";
  }

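  // Depthwise pass, fully unrolled over kernel taps and input slices; each
  // tap is masked (or clamped) at the borders and accumulated into dw_res_*.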
  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
  c += "  FLT4 src;\n";
  for (int d = 0; d < intermediate_depth; ++d) {
    const int src_ch_count = std::min(4, dw_attr.weights.shape.i - d * 4);
    const std::string s_postfix = postfixes[src_ch_count - 1];
    for (int ky = 0; ky < dw_attr.weights.shape.h; ++ky) {
      c += "  y_c = y_offseted + " + std::to_string(ky) +
           " * args.dilation_y;\n";
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
        c += "  y_in = y_c >= 0 && y_c < args.src_tensor.Height();\n";
        c += "  y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
      }
      for (int kx = 0; kx < dw_attr.weights.shape.w; ++kx) {
        c += "  x_c = x_offseted + " + std::to_string(kx) +
             " * args.dilation_x;\n";
        if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
          c += "  x_in = x_c >= 0 && x_c < args.src_tensor.Width();\n";
          c += "  x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
        }
        std::string multiplier =
            check.empty() ? "" : " * INIT_FLT(" + check + ")";
        c += "  src" + s_postfix + " = args.src_tensor.Read(x_c, y_c, " +
             std::to_string(d) + ")" + s_postfix + multiplier + ";\n";
        c += "  dw_res_" + std::to_string(d) + s_postfix + " += src" +
             s_postfix + " * constants[" + std::to_string(weights_counter++) +
             "]" + s_postfix + ";\n";
      }
    }
  }
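
  // 1x1 convolution pass: initialize each output slice with its bias, then
  // accumulate the intermediate dw results channel by channel.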
  for (int d = 0; d < result_depth; ++d) {
    c += "  FLT4 conv_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
  for (int d = 0; d < result_depth; ++d) {
    for (int s = 0; s < intermediate_depth; ++s) {
      std::string src = "dw_res_" + std::to_string(s);
      std::string dst = "conv_res_" + std::to_string(d);
      c += "  " + dst + " += " + src + ".x * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".y * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".z * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".w * constants[" +
           std::to_string(weights_counter++) + "];\n";
    }
    c += "  args.dst_tensor.Write(conv_res_" + std::to_string(d) + ", X, Y, " +
         std::to_string(d) + ");\n";
  }
  c += "}\n";

  return c;
}

}  // namespace

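// Heuristic limits: the fused kernel keeps every weight in constant memory
// and is fully unrolled, so the fusion is only applied to small channel
// counts, with thresholds that differ per vendor and precision.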
bool IsDepthwiseConvPlus1x1ConvSupported(
    const OperationDef& definition, const GpuInfo& gpu_info,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  const auto dw_shape = dw_attr.weights.shape;
  const auto conv_shape = conv_attr.weights.shape;
  bool good_dw = dw_shape.o == 1;
  bool good_conv =
      conv_shape.w == 1 && conv_shape.h == 1 && conv_attr.dilations.w == 1 &&
      conv_attr.dilations.h == 1 && conv_attr.strides.w == 1 &&
      conv_attr.strides.h == 1 && conv_attr.padding.prepended.w == 0 &&
      conv_attr.padding.prepended.h == 0 && conv_attr.padding.appended.w == 0 &&
      conv_attr.padding.appended.h == 0;
  if (gpu_info.IsApple()) {
    if (definition.precision == CalculationsPrecision::F16) {
      bool recommended_dw = dw_shape.i <= 16 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      bool recommended_conv =
          conv_shape.o <= 16 && conv_shape.i * conv_shape.o <= 16 * 16;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    } else {
      bool recommended_dw = dw_shape.i <= 16 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      bool recommended_conv =
          conv_shape.o <= 8 && conv_shape.i * conv_shape.o <= 8 * 16;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    }
  } else {
    if (definition.precision == CalculationsPrecision::F16) {
      bool recommended_dw = dw_shape.i <= 32 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 32;
      bool recommended_conv =
          conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 32 * 32;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    } else {
      bool recommended_dw = dw_shape.i <= 16 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      bool recommended_conv =
          conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 16 * 32;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    }
  }
}

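// Builds the fused GPUOperation: generates the specialized kernel source,
// maps the work grid so each work item covers one output (X, Y) position,
// and uploads the packed constant weights.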
GPUOperation CreateDepthwiseConvPlus1x1Conv(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  GPUOperation result(definition);
  result.code_ =
      GenerateCode(definition, dw_attr,
                   DivideRoundUp(conv_attr.weights.shape.o, 4), &result);
  result.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
  UploadWeights(dw_attr, conv_attr, definition.precision, &result);
  return result;
}

}  // namespace gpu
}  // namespace tflite