/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.h"

#include <algorithm>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {
namespace {
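// Packs depthwise and 1x1-conv parameters into one constant buffer, in the
// order the generated shader consumes them (see GenerateCode below):
//   [dw biases]    dw_dst_ch_aligned floats, zero-padded past shape.i;
//   [dw weights]   one FLT4 per (ky, kx, channel slice), spatial-major;
//   [conv biases]  conv_dst_ch_aligned floats, zero-padded past shape.o;
//   [conv weights] per output slice d, per input slice s: four FLT4s, one
//                  per source channel of the slice.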
void UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
                   const Convolution2DAttributes& conv_attr,
                   CalculationsPrecision precision, GPUOperation* op) {
  int dw_dst_ch_aligned = AlignByN(dw_attr.weights.shape.i, 4);
  int dw_weights_count =
      dw_dst_ch_aligned * dw_attr.weights.shape.h * dw_attr.weights.shape.w;
  int conv_src_ch_aligned = AlignByN(conv_attr.weights.shape.i, 4);
  int conv_dst_ch_aligned = AlignByN(conv_attr.weights.shape.o, 4);
  int conv_weights_count = conv_src_ch_aligned * conv_dst_ch_aligned;
  std::vector<float> gpu_data;
  gpu_data.reserve(dw_dst_ch_aligned + dw_weights_count + conv_dst_ch_aligned +
                   conv_weights_count);
  // dw bias loading
  for (int i = 0; i < dw_dst_ch_aligned; ++i) {
    if (i < dw_attr.bias.shape.v) {
      gpu_data.push_back(dw_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // dw weights loading
  for (int y = 0; y < dw_attr.weights.shape.h; ++y) {
    for (int x = 0; x < dw_attr.weights.shape.w; ++x) {
      for (int d = 0; d < dw_dst_ch_aligned / 4; ++d) {
        for (int i = 0; i < 4; ++i) {
          const int d_ch = d * 4 + i;
          if (d_ch < dw_attr.weights.shape.i) {
            const int f_index =
                dw_attr.weights.shape.LinearIndex({0, y, x, d_ch});
            gpu_data.push_back(dw_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }
  // conv bias loading
  for (int i = 0; i < conv_dst_ch_aligned; ++i) {
    if (i < conv_attr.bias.shape.v) {
      gpu_data.push_back(conv_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // conv weights loading
  for (int d = 0; d < conv_dst_ch_aligned / 4; ++d) {
    for (int s = 0; s < conv_src_ch_aligned / 4; ++s) {
      for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + j;
          const int d_ch = d * 4 + i;
          if (s_ch < conv_attr.weights.shape.i &&
              d_ch < conv_attr.weights.shape.o) {
            const int f_index =
                conv_attr.weights.shape.LinearIndex({d_ch, 0, 0, s_ch});
            gpu_data.push_back(conv_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }
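  // Pack as FLOAT32 or FLOAT16 to match the precision the generated kernel
  // computes in. element_size is 4 because the kernel reads the buffer as
  // FLT4 values rather than scalars.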
  const bool fp32_weights = precision == CalculationsPrecision::F32;
  const int float_size = fp32_weights ? 4 : 2;
  BufferDescriptor desc;
  desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.element_size = 4;
  desc.memory_type = MemoryType::CONSTANT;
  desc.size = float_size * gpu_data.size();
  desc.data.resize(desc.size);

  if (fp32_weights) {
    memcpy(desc.data.data(), gpu_data.data(), desc.size);
  } else {
    half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
    for (int i = 0; i < gpu_data.size(); ++i) {
      gpu_data_half[i] = gpu_data[i];
    }
  }
  op->args_.AddObject("constants",
                      absl::make_unique<BufferDescriptor>(std::move(desc)));
}
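// Emits the fused shader: one work item per (X, Y) output location (plus
// batch, when present). All loops over kernel taps and channel slices are
// unrolled at codegen time, so the shader walks the constant buffer in
// exactly the order UploadWeights packed it (tracked via weights_counter).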
std::string GenerateCode(const OperationDef& op_def,
                         const DepthwiseConvolution2DAttributes& dw_attr,
                         int result_depth, GPUOperation* result) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  result->AddSrcTensor("src_tensor", src_desc);
  result->AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  result->args_.AddInt("stride_x", dw_attr.strides.w);
  result->args_.AddInt("padding_x", -dw_attr.padding.prepended.w);
  result->args_.AddInt("dilation_x", dw_attr.dilations.w);
  result->args_.AddInt("stride_y", dw_attr.strides.h);
  result->args_.AddInt("padding_y", -dw_attr.padding.prepended.h);
  result->args_.AddInt("dilation_y", dw_attr.dilations.h);

  std::string c;
  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = GLOBAL_ID_0;\n";
  }
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
       "{\n";
  c += "    return;\n";
  c += "  }\n";
  c += "  __constant FLT4* constants = args.constants.GetPtr();\n";
  int intermediate_depth = DivideRoundUp(dw_attr.weights.shape.i, 4);
  int weights_counter = 0;
  for (int d = 0; d < intermediate_depth; ++d) {
    c += "  FLT4 dw_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
  c += "  int x_offseted = X * args.stride_x + args.padding_x;\n";
  c += "  int y_offseted = Y * args.stride_y + args.padding_y;\n";
  c += "  int x_c, y_c;\n";
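  // On backends that cannot zero-clamp out-of-bounds reads, declare explicit
  // in-bounds flags; out-of-range taps are then zeroed by the INIT_FLT
  // multiplier applied to every fetched value below.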
  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_in", "y_in", "z_in"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
    c += "  bool y_in;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
    c += "  bool x_in;\n";
  }

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
  c += "  FLT4 src;\n";
  for (int ky = 0; ky < dw_attr.weights.shape.h; ++ky) {
    c += "  y_c = y_offseted + " + std::to_string(ky) +
         " * args.dilation_y;\n";
    if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
      c += "  y_in = y_c >= 0 && y_c < args.src_tensor.Height();\n";
      c += "  y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
    }
    for (int kx = 0; kx < dw_attr.weights.shape.w; ++kx) {
      c += "  x_c = x_offseted + " + std::to_string(kx) +
           " * args.dilation_x;\n";
      if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
        c += "  x_in = x_c >= 0 && x_c < args.src_tensor.Width();\n";
        c += "  x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
      }
      for (int d = 0; d < intermediate_depth; ++d) {
        const int src_ch_count = std::min(4, dw_attr.weights.shape.i - d * 4);
        const std::string s_postfix = postfixes[src_ch_count - 1];
        std::string multiplier =
            check.empty() ? "" : " * INIT_FLT(" + check + ")";
        c += "  src" + s_postfix + " = args.src_tensor.Read(x_c, y_c, " +
             std::to_string(d) + ")" + s_postfix + multiplier + ";\n";
        c += "  dw_res_" + std::to_string(d) + s_postfix + " += src" +
             s_postfix + " * constants[" + std::to_string(weights_counter++) +
             "]" + s_postfix + ";\n";
      }
    }
  }
  for (int d = 0; d < result_depth; ++d) {
    c += "  FLT4 conv_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
  for (int d = 0; d < result_depth; ++d) {
    for (int s = 0; s < intermediate_depth; ++s) {
      std::string src = "dw_res_" + std::to_string(s);
      std::string dst = "conv_res_" + std::to_string(d);
      c += "  " + dst + " += " + src + ".x * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".y * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".z * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += "  " + dst + " += " + src + ".w * constants[" +
           std::to_string(weights_counter++) + "];\n";
    }
    c += "  args.dst_tensor.Write(conv_res_" + std::to_string(d) +
         ", X, Y, " + std::to_string(d) + ");\n";
  }
  c += "}\n";

  return c;
}

}  // namespace
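// The fusion applies only when the depthwise op has channel multiplier 1 and
// the following convolution is a pure pointwise 1x1 (stride 1, no dilation,
// no padding). The size limits (at most 16 dw channels, 32 conv outputs)
// bound the packed buffer; presumably they are chosen so it fits comfortably
// in mobile-GPU constant memory. At the limits (16 channels, 3x3 kernel):
//   dw:   16 + 16 * 3 * 3 = 160 floats (bias + weights)
//   conv: 32 + 16 * 32    = 544 floats (bias + weights)
// i.e. under 3 KB even in FP32.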
bool IsDepthwiseConvPlus1x1ConvSupported(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  const auto dw_shape = dw_attr.weights.shape;
  const auto conv_shape = conv_attr.weights.shape;
  bool good_dw = dw_shape.o == 1;
  bool good_conv =
      conv_shape.w == 1 && conv_shape.h == 1 && conv_attr.dilations.w == 1 &&
      conv_attr.dilations.h == 1 && conv_attr.strides.w == 1 &&
      conv_attr.strides.h == 1 && conv_attr.padding.prepended.w == 0 &&
      conv_attr.padding.prepended.h == 0 &&
      conv_attr.padding.appended.w == 0 && conv_attr.padding.appended.h == 0;
  bool recommended_dw =
      dw_shape.i <= 16 && dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
  bool recommended_conv =
      conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 16 * 32;
  return good_dw && good_conv && recommended_dw && recommended_conv;
}
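// A minimal (hypothetical) call-site sketch; selection of this fused op
// presumably happens in the delegate's graph transformation pass:
//
//   if (IsDepthwiseConvPlus1x1ConvSupported(def, dw_attr, conv_attr)) {
//     GPUOperation op = CreateDepthwiseConvPlus1x1Conv(def, dw_attr, conv_attr);
//     // ... move `op` into the selected-operations list ...
//   }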
GPUOperation CreateDepthwiseConvPlus1x1Conv(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  GPUOperation result(definition);
  result.code_ =
      GenerateCode(definition, dw_attr,
                   DivideRoundUp(conv_attr.weights.shape.o, 4), &result);
  result.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
  UploadWeights(dw_attr, conv_attr, definition.precision, &result);
  return result;
}

}  // namespace gpu
}  // namespace tflite