/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/depthwise_conv.h"

#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/task/tensor_linear_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {

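// Channel multipliers 1, 2 and 4 get dedicated read paths in GetSrcValue()
// and therefore do not need the runtime "ch_multiplier" argument.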
bool IsSpecializedCase(int channel_multiplier) {
  return channel_multiplier == 1 || channel_multiplier == 2 ||
         channel_multiplier == 4;
}

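// Emits shader code that loads the source value feeding destination slice S.
// For a depthwise convolution with channel multiplier m, destination channel
// d reads source channel d / m, so each source channel is replicated into m
// consecutive destination channels. For example, with channel_multiplier == 2
// and 2D coordinates "x_c, y_c", the generated snippet is:
//
//   int s_layer = S / 2;
//   FLT4 src = args.src_tensor.Read(x_c, y_c, s_layer);
//   FLT2 t0 = S % 2 == 0 ? src.xy : src.zw;
//   FLT4 src_final = INIT_FLT4v4(t0.x, t0.x, t0.y, t0.y);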
std::string GetSrcValue(int channel_multiplier, const std::string& coords) {
  std::string c;
  if (channel_multiplier == 1) {
    c += " FLT4 src_final = args.src_tensor.Read(" + coords + ", S);\n";
  } else if (channel_multiplier == 2) {
    c += " int s_layer = S / 2;\n";
    c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
    c += " FLT2 t0 = S % 2 == 0 ? src.xy : src.zw;\n";
    c += " FLT4 src_final = INIT_FLT4v4(t0.x, t0.x, t0.y, t0.y);\n";
  } else if (channel_multiplier == 4) {
    c += " int s_layer = S / 4;\n";
    c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
    c += " FLT t0 = src.x;\n";
    c += " int remainder = S % 4;\n";
    c += " if (remainder == 1) t0 = src.y;\n";
    c += " if (remainder == 2) t0 = src.z;\n";
    c += " if (remainder == 3) t0 = src.w;\n";
    c += " FLT4 src_final = INIT_FLT4v4(t0, t0, t0, t0);\n";
  } else {
    // Generic path: pick source channel (4 * S + i) / ch_multiplier for each
    // of the four destination channels i in slice S.
    c += " int s_layer = S / args.ch_multiplier;\n";
    c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
    c += " int s_offset = (S % args.ch_multiplier) * 4;\n";
    c += " FLT4 src_final;\n";
    c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n";
    c += " src_final.x = temp_arr[(s_offset + 0) / args.ch_multiplier];\n";
    c += " src_final.y = temp_arr[(s_offset + 1) / args.ch_multiplier];\n";
    c += " src_final.z = temp_arr[(s_offset + 2) / args.ch_multiplier];\n";
    c += " src_final.w = temp_arr[(s_offset + 3) / args.ch_multiplier];\n";
  }

  return c;
}

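// Builds the kernel source for a depthwise convolution. Each work item
// computes one output element (X, Y[, Z], S): it walks the kernel window,
// skips taps that fall outside the source tensor when the storage type cannot
// zero-clamp reads, accumulates src * weight, adds the bias and writes the
// result.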
std::string GenerateDepthwiseConvolutionCode(
    const OperationDef& op_def, bool stride_correction, int channel_multiplier,
    bool weights_are_buffer, bool dynamic_weights, GPUOperation* op) {
  auto src_desc = op_def.src_tensors[0];
  src_desc.SetAddressMode(AddressMode::kZero);
  if (op_def.IsBatchSupported()) {
    src_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddSrcTensor("src_tensor", src_desc);
  if (dynamic_weights) {
    op->AddSrcTensor("weights", op_def.src_tensors[1]);
  }

  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
  }
  op->AddDstTensor("dst_tensor", dst_desc);

  std::string c;

  c += "MAIN_FUNCTION(\n";
  c += "$0) {\n";
  c += " int X = GLOBAL_ID_0;\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += " int linear_id_1 = GLOBAL_ID_1;\n";
    c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n";
    c += " int Z = linear_id_1 % args.dst_tensor.Depth();\n";
  } else {
    c += " int Y = GLOBAL_ID_1;\n";
  }
  c += " int S = GLOBAL_ID_2;\n";
  c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
       "S >= args.dst_tensor.Slices()) { \n";
  c += " return; \n";
  c += " } \n";
  c += " ACCUM_FLT4 r = INIT_ACCUM_FLT4(0.0f);\n";
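  // In batched-width mode the X coordinate fuses width and batch, so the
  // horizontal offset (and the dilation below) is scaled by the batch size.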
  if (stride_correction) {
    c += " int x_offseted = " +
         GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
                               "args.padding_x") +
         ";\n";
  } else {
    if (op_def.IsBatchSupported()) {
      c += " int x_offseted = X * args.stride_x + args.padding_x * "
           "args.src_tensor.Batch();\n";
    } else {
      c += " int x_offseted = X * args.stride_x + args.padding_x;\n";
    }
  }
  c += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
  if (!dynamic_weights) {
    std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
    if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
      c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
      weights_offset += " * args.kernel_size_z";
    }
    if (weights_are_buffer) {
      c += " int fx_c = S * " + weights_offset + ";\n";
    } else {
      c += " int fx_c = 0;\n";
    }
  }
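  // With dynamic weights the kernel extents come from the weights tensor
  // itself; otherwise they are passed in as scalar arguments.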
  std::string kernel_size_x =
      dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x";
  std::string kernel_size_y =
      dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y";
  std::string kernel_size_z =
      dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z";

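  // For storage types that cannot clamp out-of-bounds reads to zero, build a
  // per-axis "outside_*" guard and the coordinate list for the source read.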
  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"outside_x", "outside_y",
                                         "outside_z"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) && !src_desc.SupportsZeroClamp(axis)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += "!" + names[i];
      }
    }
    return check;
  };
  auto generate_coords = [&]() {
    std::string coords;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_c", "y_c", "z_c"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis)) {
        if (!coords.empty()) {
          coords += ", ";
        }
        coords += names[i];
      }
    }
    return coords;
  };
  const std::string check = generate_check();
  const std::string coords = generate_coords();

  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
    c += " int z_c = z_offseted + kz * args.dilation_z;\n";
    if (!src_desc.SupportsZeroClamp(Axis::DEPTH)) {
      c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n";
    }
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::HEIGHT)) {
    c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
    c += " int y_c = y_offseted + ky * args.dilation_y;\n";
    if (!src_desc.SupportsZeroClamp(Axis::HEIGHT)) {
      c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
    }
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::WIDTH)) {
    c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
    const std::string dilation_x =
        op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
                                  : "args.dilation_x";
    c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
    if (!src_desc.SupportsZeroClamp(Axis::WIDTH)) {
      c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n";
    }
  }
  if (!check.empty()) {
    c += " if (" + check + ") {\n";
  }
  if (dynamic_weights) {
    c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
  } else {
    if (weights_are_buffer) {
      c += " FLT4 f = args.weights.Read(fx_c);\n";
    } else {
      c += " FLT4 f = args.weights.Read(fx_c, S);\n";
    }
  }
  c += GetSrcValue(channel_multiplier, coords);
  c += " r += TO_ACCUM_TYPE(src_final * f);\n";
  if (!check.empty()) {
    c += " }\n";
  }
  if (!dynamic_weights) {
    c += " fx_c++;\n";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::WIDTH)) {
    c += " }\n";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::HEIGHT)) {
    c += " }\n";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += " }\n";
  }
  c += " FLT4 res0 = TO_FLT4(r) + args.biases.Read(S);\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
    c += " args.dst_tensor.Write(res0, X, Y, Z, S);\n";
  } else {
    c += " args.dst_tensor.Write(res0, X, Y, S);\n";
  }
  c += "}\n";
  return c;
}
}  // namespace

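// Depthwise convolution with constant weights. Weights and biases are stored
// in buffers on Mali and Apple GPUs (and whenever images are unsupported),
// otherwise in 2D textures.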
GPUOperation CreateDepthwiseConvolution2D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  bool weights_are_buffer =
      !gpu_info.SupportsImages() || gpu_info.IsMali() || gpu_info.IsApple();
  GPUOperation op(definition);
  op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  if (!IsSpecializedCase(attr.weights.shape.o)) {
    op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
  }
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
                                              attr.weights.shape.o,
                                              weights_are_buffer, false, &op);
  UploadWeightsForDWConv2D(attr.weights, weights_are_buffer,
                           definition.precision, &op);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

  TensorLinearDescriptor desc;
  desc.storage_type = weights_are_buffer ? LinearStorageType::BUFFER
                                         : LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

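// Depthwise convolution whose weights arrive at runtime as a second source
// tensor, so no kernel-size arguments or weight upload are needed; the
// channel multiplier is fixed to 1.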
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  GPUOperation op(definition);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, 1,
                                              false, true, &op);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

  TensorLinearDescriptor desc;
  desc.storage_type =
      !gpu_info.SupportsImages() || gpu_info.IsMali() || gpu_info.IsApple()
          ? LinearStorageType::BUFFER
          : LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

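// 3D variant of CreateDepthwiseConvolution2D; adds kernel size, stride,
// padding and dilation arguments for the depth axis.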
GPUOperation CreateDepthwiseConvolution3D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const DepthwiseConvolution3DAttributes& attr) {
  bool weights_are_buffer =
      !gpu_info.SupportsImages() || gpu_info.IsMali() || gpu_info.IsApple();
  GPUOperation op(definition);
  op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
  op.args_.AddInt("stride_y", attr.strides.h);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  op.args_.AddInt("kernel_size_z", attr.weights.shape.d);
  op.args_.AddInt("stride_z", attr.strides.d);
  op.args_.AddInt("padding_z", -attr.padding.prepended.d);
  op.args_.AddInt("dilation_z", attr.dilations.d);
  if (!IsSpecializedCase(attr.weights.shape.o)) {
    op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
  }
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
                                              attr.weights.shape.o,
                                              weights_are_buffer, false, &op);
  UploadWeightsForDWConv3D(attr.weights, weights_are_buffer,
                           definition.precision, &op);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

  TensorLinearDescriptor desc;
  desc.storage_type = weights_are_buffer ? LinearStorageType::BUFFER
                                         : LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();
  desc.UploadLinearData(attr.bias);
  op.args_.AddObject(
      "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
  return op;
}

}  // namespace gpu
}  // namespace tflite