/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.h"

#include <algorithm>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace {
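// Packs all parameters of the fused operation into a single constant buffer:
// depthwise biases, depthwise weights, 1x1 conv biases and 1x1 conv weights,
// in that order. Channel counts are aligned to multiples of 4 so the data can
// be read as FLT4 vectors by the generated kernel; padding slots are zeroed.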
void UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
                   const Convolution2DAttributes& conv_attr,
                   CalculationsPrecision precision, GPUOperation* op) {
  int dw_dst_ch_aligned = AlignByN(dw_attr.weights.shape.i, 4);
  int dw_weights_count =
      dw_dst_ch_aligned * dw_attr.weights.shape.h * dw_attr.weights.shape.w;
  int conv_src_ch_aligned = AlignByN(conv_attr.weights.shape.i, 4);
  int conv_dst_ch_aligned = AlignByN(conv_attr.weights.shape.o, 4);
  int conv_weights_count = conv_src_ch_aligned * conv_dst_ch_aligned;
  std::vector<float> gpu_data;
  gpu_data.reserve(dw_dst_ch_aligned + dw_weights_count + conv_dst_ch_aligned +
                   conv_weights_count);
  // dw bias loading
  for (int i = 0; i < dw_dst_ch_aligned; ++i) {
    if (i < dw_attr.bias.shape.v) {
      gpu_data.push_back(dw_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // dw weights loading
  for (int d = 0; d < dw_dst_ch_aligned / 4; ++d) {
    for (int y = 0; y < dw_attr.weights.shape.h; ++y) {
      for (int x = 0; x < dw_attr.weights.shape.w; ++x) {
        for (int i = 0; i < 4; ++i) {
          const int d_ch = d * 4 + i;
          if (d_ch < dw_attr.weights.shape.i) {
            const int f_index =
                dw_attr.weights.shape.LinearIndex({0, y, x, d_ch});
            gpu_data.push_back(dw_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }
  // conv bias loading
  for (int i = 0; i < conv_dst_ch_aligned; ++i) {
    if (i < conv_attr.bias.shape.v) {
      gpu_data.push_back(conv_attr.bias.data[i]);
    } else {
      gpu_data.push_back(0.0f);
    }
  }
  // conv weights loading
  for (int d = 0; d < conv_dst_ch_aligned / 4; ++d) {
    for (int s = 0; s < conv_src_ch_aligned / 4; ++s) {
      for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + j;
          const int d_ch = d * 4 + i;
          if (s_ch < conv_attr.weights.shape.i &&
              d_ch < conv_attr.weights.shape.o) {
            const int f_index =
                conv_attr.weights.shape.LinearIndex({d_ch, 0, 0, s_ch});
            gpu_data.push_back(conv_attr.weights.data[f_index]);
          } else {
            gpu_data.push_back(0.0f);
          }
        }
      }
    }
  }

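  // Convert the packed floats to the calculation precision and register the
  // result as the "constants" buffer object on the operation.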
  const bool fp32_weights = precision == CalculationsPrecision::F32;
  const int float_size = fp32_weights ? 4 : 2;
  BufferDescriptor desc;
  desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.element_size = 4;
  desc.memory_type = MemoryType::CONSTANT;
  desc.size = float_size * gpu_data.size();
  desc.data.resize(desc.size);

  if (fp32_weights) {
    memcpy(desc.data.data(), gpu_data.data(), desc.size);
  } else {
    half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
    for (int i = 0; i < gpu_data.size(); ++i) {
      gpu_data_half[i] = gpu_data[i];
    }
  }
  op->args_.AddObject("constants",
                      absl::make_unique<BufferDescriptor>(std::move(desc)));
}

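// Generates the fused shader: each work item handles one output (X, Y)
// position (and batch, if present), computes the depthwise results for all
// intermediate depth slices in registers, then applies the 1x1 convolution
// and writes `result_depth` output slices.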
std::string GenerateCode(const OperationDef& op_def,
                         const DepthwiseConvolution2DAttributes& dw_attr,
                         int result_depth, GPUOperation* result) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  result->AddSrcTensor("src_tensor", src_desc);
  result->AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  result->args_.AddInt("stride_x", dw_attr.strides.w);
  result->args_.AddInt("padding_x", -dw_attr.padding.prepended.w);
  result->args_.AddInt("dilation_x", dw_attr.dilations.w);
  result->args_.AddInt("stride_y", dw_attr.strides.h);
  result->args_.AddInt("padding_y", -dw_attr.padding.prepended.h);
  result->args_.AddInt("dilation_y", dw_attr.dilations.h);

  std::string c;
  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += " int linear_id = GLOBAL_ID_0;\n";
    c += " int X = linear_id / args.dst_tensor.Batch();\n";
    c += " int B = linear_id % args.dst_tensor.Batch();\n";
    c += " args.dst_tensor.SetBatchRef(B);\n";
    c += " args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += " int X = GLOBAL_ID_0;\n";
  }
  c += " int Y = GLOBAL_ID_1;\n";
  c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) { "
       "\n";
  c += " return; \n";
  c += " } \n";
  c += " __constant FLT4* constants = args.constants.GetPtr();\n";
  int intermediate_depth = DivideRoundUp(dw_attr.weights.shape.i, 4);
  int weights_counter = 0;
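  // The first `intermediate_depth` constants are the depthwise biases; use
  // them to initialize the depthwise accumulators.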
  for (int d = 0; d < intermediate_depth; ++d) {
    c += " FLT4 dw_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
  c += " int x_offseted = X * args.stride_x + args.padding_x;\n";
  c += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
  c += " int x_c, y_c;\n";

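  // For source tensors that cannot clamp out-of-bounds reads to zero, build a
  // boolean in-bounds check that is multiplied into each loaded value.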
  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_in", "y_in", "z_in"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
    c += " bool y_in;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
    c += " bool x_in;\n";
  }

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
  c += " FLT4 src;\n";
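  // Depthwise pass: for every kernel tap, read the source pixel and
  // accumulate it into the per-slice dw_res_* registers.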
  for (int d = 0; d < intermediate_depth; ++d) {
    const int src_ch_count = std::min(4, dw_attr.weights.shape.i - d * 4);
    const std::string s_postfix = postfixes[src_ch_count - 1];
    for (int ky = 0; ky < dw_attr.weights.shape.h; ++ky) {
      c += " y_c = y_offseted + " + std::to_string(ky) +
           " * args.dilation_y;\n";
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
        c += " y_in = y_c >= 0 && y_c < args.src_tensor.Height();\n";
        c += " y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
      }
      for (int kx = 0; kx < dw_attr.weights.shape.w; ++kx) {
        c += " x_c = x_offseted + " + std::to_string(kx) +
             " * args.dilation_x;\n";
        if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
          c += " x_in = x_c >= 0 && x_c < args.src_tensor.Width();\n";
          c += " x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
        }
        std::string multiplier =
            check.empty() ? "" : " * INIT_FLT(" + check + ")";
        c += " src" + s_postfix + " = args.src_tensor.Read(x_c, y_c, " +
             std::to_string(d) + ")" + s_postfix + multiplier + ";\n";
        c += " dw_res_" + std::to_string(d) + s_postfix + " += src" +
             s_postfix + " * constants[" + std::to_string(weights_counter++) +
             "]" + s_postfix + ";\n";
      }
    }
  }
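  // The next `result_depth` constants are the 1x1 conv biases; use them to
  // initialize the output accumulators.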
  for (int d = 0; d < result_depth; ++d) {
    c += " FLT4 conv_res_" + std::to_string(d) + " = constants[" +
         std::to_string(weights_counter++) + "];\n";
  }
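  // 1x1 convolution over the intermediate depthwise results: each output
  // slice accumulates four weight vectors per intermediate slice, then the
  // finished slice is written to the destination tensor.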
  for (int d = 0; d < result_depth; ++d) {
    for (int s = 0; s < intermediate_depth; ++s) {
      std::string src = "dw_res_" + std::to_string(s);
      std::string dst = "conv_res_" + std::to_string(d);
      c += " " + dst + " += " + src + ".x * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += " " + dst + " += " + src + ".y * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += " " + dst + " += " + src + ".z * constants[" +
           std::to_string(weights_counter++) + "];\n";
      c += " " + dst + " += " + src + ".w * constants[" +
           std::to_string(weights_counter++) + "];\n";
    }
    c += " args.dst_tensor.Write(conv_res_" + std::to_string(d) + ", X, Y, " +
         std::to_string(d) + ");\n";
  }
  c += "}\n";

  return c;
}

}  // namespace

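// The fusion is only applied to a depthwise convolution with channel
// multiplier 1 followed by a plain 1x1 convolution (unit strides and
// dilations, no padding), and only for small channel counts; the exact
// thresholds depend on the GPU vendor and the calculation precision.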
bool IsDepthwiseConvPlus1x1ConvSupported(
    const OperationDef& definition, const GpuInfo& gpu_info,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  const auto dw_shape = dw_attr.weights.shape;
  const auto conv_shape = conv_attr.weights.shape;
  bool good_dw = dw_shape.o == 1;
  bool good_conv =
      conv_shape.w == 1 && conv_shape.h == 1 && conv_attr.dilations.w == 1 &&
      conv_attr.dilations.h == 1 && conv_attr.strides.w == 1 &&
      conv_attr.strides.h == 1 && conv_attr.padding.prepended.w == 0 &&
      conv_attr.padding.prepended.h == 0 && conv_attr.padding.appended.w == 0 &&
      conv_attr.padding.appended.h == 0;
  if (gpu_info.IsApple()) {
    if (definition.precision == CalculationsPrecision::F16) {
      bool recommended_dw = dw_shape.i <= 16 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      bool recommended_conv =
          conv_shape.o <= 16 && conv_shape.i * conv_shape.o <= 16 * 16;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    } else {
      bool recommended_dw = dw_shape.i <= 16 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      bool recommended_conv =
          conv_shape.o <= 8 && conv_shape.i * conv_shape.o <= 8 * 16;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    }
  } else {
    if (definition.precision == CalculationsPrecision::F16) {
      bool recommended_dw = dw_shape.i <= 32 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 32;
      bool recommended_conv =
          conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 32 * 32;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    } else {
      bool recommended_dw = dw_shape.i <= 16 &&
                            dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      bool recommended_conv =
          conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 16 * 32;
      return good_dw && good_conv && recommended_dw && recommended_conv;
    }
  }
}

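// Builds the fused GPUOperation: generates the shader for the given shapes,
// maps the work grid to one work item per output (x, y) position, and uploads
// the packed constant buffer.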
GPUOperation CreateDepthwiseConvPlus1x1Conv(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& dw_attr,
    const Convolution2DAttributes& conv_attr) {
  GPUOperation result(definition);
  result.code_ =
      GenerateCode(definition, dw_attr,
                   DivideRoundUp(conv_attr.weights.shape.o, 4), &result);
  result.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
  UploadWeights(dw_attr, conv_attr, definition.precision, &result);
  return result;
}

}  // namespace gpu
}  // namespace tflite