// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t output_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}
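
// Example: for a 5-element input with stride 2, zero adjustment, a 3-tap kernel,
// dilation 1, and total output padding 2, the effective kernel is (3 - 1) * 1 + 1 = 3
// and the output dimension is doz(2 * (5 - 1) + 0 + 3, 2) = doz(11, 2) = 9
// (doz() is a saturating subtraction, so excess padding cannot underflow the size).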

static enum xnn_status create_deconvolution2d_nhwc(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const void* kernel,
    const void* bias,
    uint32_t flags,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    xnn_pack_conv_goki_w_function pack_conv_goki_w,
    xnn_pack_deconv_goki_w_function pack_deconv_goki_w,
    const void* packing_params,
    int input_padding_byte,
    int packed_weights_padding_byte,
    const void* params,
    size_t params_size,
    const struct gemm_parameters* gemm_parameters,
    const struct gemm_fused_ukernels* gemm_ukernels,
    enum xnn_operator_type operator_type,
    xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
      xnn_operator_type_to_string(operator_type), groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create %s operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  const bool any_padding = (output_padding_left | output_padding_top | output_padding_right | output_padding_bottom) != 0;
  if (any_padding && (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32 " padding: "
      "TensorFlow SAME padding can't be combined with explicit padding specification",
      xnn_operator_type_to_string(operator_type),
      output_padding_top, output_padding_left, output_padding_bottom, output_padding_right);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  const uint32_t mr = gemm_parameters->mr;
  const uint32_t nr = gemm_parameters->nr;
  const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
  const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;

  const uint32_t n_stride = round_up(group_output_channels, nr);
  const uint32_t k_stride = round_up_po2(group_input_channels, kr);
  const uint32_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size) * n_stride;
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator subconvolution buffer",
        subconvolution_buffer_size, xnn_operator_type_to_string(operator_type));
      goto error;
    }

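    // The deconvolution decomposes into stride_height * stride_width
    // subconvolutions, one per (offset_y, offset_x) phase of the output grid;
    // each subkernel keeps only the kernel taps that land on its phase. For
    // example, a 3x3 kernel with 2x2 stride splits into subkernels with 2x2,
    // 2x1, 1x2, and 1x1 taps.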
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = bias_element_size + (k_stride * subkernel_size << log2_filter_element_size);
        subconvolution_params++;
      }
    }
  }
  deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator packed weights",
      packed_group_weights_size * groups, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  memset(deconvolution_op->packed_weights, packed_weights_padding_byte, packed_group_weights_size * groups);

  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      pack_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights,
        packing_params);
      break;
    case xnn_ukernel_type_subconv2d:
      pack_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer,
        packing_params);
      break;
    default:
      XNN_UNREACHABLE;
  }

  const size_t zero_size = (k_stride << log2_input_element_size) + XNN_EXTRA_BYTES;
  deconvolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
  if (deconvolution_op->zero_buffer == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator zero padding",
      zero_size, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  memset(deconvolution_op->zero_buffer, input_padding_byte, zero_size);

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  memcpy(&deconvolution_op->params, params, params_size);
  deconvolution_op->type = operator_type;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .general_case = gemm_ukernels->igemm,
    .gemm_case = gemm_ukernels->gemm,
    .mr = mr,
    .nr = nr,
    .kr = kr,
  };

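  // With unit stride, TensorFlow SAME padding depends only on the kernel size
  // and dilation, so it can be fixed here at creation time: e.g. a 3x3 kernel
  // with dilation 1 needs 2 rows and 2 columns of padding, split 1/1 between
  // top/bottom and left/right. With larger strides it also depends on the
  // input size and is recomputed in setup_deconvolution2d_nhwc below.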
  if (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    if ((stride_height | stride_width) == 1) {
      // Padding can be computed statically.
      const uint32_t padding_height = (kernel_height - 1) * dilation_height;
      const uint32_t padding_width = (kernel_width - 1) * dilation_width;

      const uint32_t padding_top = padding_height / 2;
      const uint32_t padding_left = padding_width / 2;

      deconvolution_op->padding_top = padding_top;
      deconvolution_op->padding_left = padding_left;
      deconvolution_op->padding_bottom = padding_height - padding_top;
      deconvolution_op->padding_right = padding_width - padding_left;
    } else {
      deconvolution_op->flags = XNN_FLAG_TENSORFLOW_SAME_PADDING;
    }
  }

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}

enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 1.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater than or equal to 1.0",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  const union xnn_qu8_gemm_params params = xnn_init_qu8_gemm_params(
    kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
  const struct xnn_qu8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
    .kernel_zero_point = kernel_zero_point,
  };
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_qu8_deconv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qu8.gemm, &xnn_params.qu8.gemm.minmax,
    xnn_operator_type_deconvolution_nhwc_qu8,
    deconvolution_op_out);
}

enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const struct gemm_parameters* gemm_parameters = &xnn_params.f32.gemm;
  if (gemm_parameters->nr > group_output_channels) {
    // Default micro-kernel is suboptimal. Try to find a better micro-kernel.
    if (xnn_params.f32.gemm2.minmax.igemm.function[XNN_UARCH_DEFAULT] != NULL) {
      gemm_parameters = &xnn_params.f32.gemm2;
    }
  }
  const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
  if (linear_activation && gemm_parameters->linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
    gemm_ukernels = &gemm_parameters->linear;
  }

  const union xnn_f32_minmax_params params = xnn_init_f32_minmax_params(output_min, output_max);
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_f32_deconv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    gemm_parameters, gemm_ukernels,
    xnn_operator_type_deconvolution_nhwc_f32,
    deconvolution_op_out);
}
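
// A minimal creation sketch (illustrative only, with assumed shapes): a 3x3,
// stride-2 float deconvolution with 16 input and 32 output channels, assuming
// xnn_initialize() already succeeded and `kernel`/`bias` point to suitably
// laid-out data:
//
//   xnn_operator_t deconvolution_op = NULL;
//   enum xnn_status status = xnn_create_deconvolution2d_nhwc_f32(
//     0, 0, 0, 0 /* output padding */,
//     3, 3 /* kernel */, 2, 2 /* stride */, 1, 1 /* dilation */,
//     1 /* groups */, 16 /* group input channels */, 32 /* group output channels */,
//     16 /* input pixel stride */, 32 /* output pixel stride */,
//     kernel, bias, -INFINITY, INFINITY /* linear activation */, 0 /* flags */,
//     &deconvolution_op);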

static enum xnn_status setup_conv_path(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    size_t output_height,
    size_t output_width,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t params_size,
    size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator indirection buffer",
        indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.igemm = (struct igemm_context) {
    .ks = kernel_size,
    .ks_scaled = kernel_size * mr * sizeof(void*),
    .kc = group_input_channels << log2_input_element_size,
    .w_stride = w_stride,
    .indirect_a = deconvolution_op->indirection_buffer,
    .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
    .zero = deconvolution_op->zero_buffer,
    .packed_w = deconvolution_op->packed_weights,
    .c = deconvolution_op->output,
    .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
    .cn_stride = nr << log2_output_element_size,
    .ga_stride = group_input_channels << log2_input_element_size,
    .gw_stride = w_stride * round_up(group_output_channels, nr),
    .gc_stride = group_output_channels << log2_output_element_size,
    .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
    .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
    .log2_csize = log2_output_element_size,
    .ukernel = deconvolution_op->ukernel.igemm.general_case,
  };
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_case;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, params_size);

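  // Shrink the output-channel tile so that, together with the batch/group/pixel
  // tiles, each thread gets roughly target_tiles_per_thread tiles of work; the
  // tile stays a multiple of nr so it aligns with the micro-kernel.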
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  if (groups == 1) {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
      deconvolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
      deconvolution_op->compute.range[0] = output_size;
      deconvolution_op->compute.range[1] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
      deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = groups;
      deconvolution_op->compute.range[2] = output_size;
      deconvolution_op->compute.range[3] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
      deconvolution_op->compute.range[0] = groups;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_subconv2d_path(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    size_t output_height,
    size_t output_width,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t params_size,
    size_t num_threads,
    bool use_gemm)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t stride_height = deconvolution_op->stride_height;
  const size_t stride_width = deconvolution_op->stride_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;

  const size_t input_pixel_stride = deconvolution_op->input_pixel_stride << log2_input_element_size;
  const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;

  const bool any_size_change =
    input_height != deconvolution_op->last_input_height ||
    input_width != deconvolution_op->last_input_width ||
    output_height != deconvolution_op->last_output_height ||
    output_width != deconvolution_op->last_output_width;

  if (any_size_change || output != deconvolution_op->last_output) {
    // Initialize subconvolution parameters which depend on output dimensions or MR.
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
    const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
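    // subtract_modulo(a, b, m) computes (a - b) mod m, so each subconvolution
    // starts at the first output row/column of its phase. For example, with
    // stride 2 and padding_left = 1, the offset_x = 0 subkernel writes output
    // columns 1, 3, 5, ... and the offset_x = 1 subkernel writes columns 0, 2, 4, ...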
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
        const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
        subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
        subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
        subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
        subconvolution_params->output =
          (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
        ++subconvolution_params;
      }
    }
    deconvolution_op->last_output = output;
  }

  if (any_size_change) {
    if (!use_gemm) {
      const size_t indirection_buffer_size = sizeof(void*) *
        kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);

      const void** indirection_buffer =
        (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        xnn_log_error(
          "failed to allocate %zu bytes for %s operator indirection buffer",
          indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
        return xnn_status_out_of_memory;
      }
      deconvolution_op->indirection_buffer = indirection_buffer;
      deconvolution_op->last_input = input;

      xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);
    }
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;
    deconvolution_op->last_output_height = output_height;
    deconvolution_op->last_output_width = output_width;
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const uint32_t kr = deconvolution_op->ukernel.igemm.kr;
  const size_t w_stride = stride_height * stride_width * bias_element_size +
    (round_up_po2(group_input_channels, kr) * kernel_size << log2_filter_element_size);
  if (use_gemm) {
    deconvolution_op->context.subgemm = (struct subgemm_context) {
      .subconvolution_params = deconvolution_op->subconvolution_buffer,
      .kc = group_input_channels << log2_input_element_size,
      .a = input,
      .ax_stride = input_pixel_stride,
      .ay_stride = input_width * input_pixel_stride,
      .cx_stride = stride_width * output_pixel_stride,
      .cy_stride = stride_height * output_width * output_pixel_stride,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * input_pixel_stride,
      .bc_stride = output_size * output_pixel_stride,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.gemm_case,
    };
    memcpy(&deconvolution_op->context.subgemm.params, params, params_size);
  } else {
    deconvolution_op->context.subconv = (struct subconv_context) {
      .subconvolution_params = deconvolution_op->subconvolution_buffer,
      .kc = group_input_channels << log2_input_element_size,
      .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
      .zero = deconvolution_op->zero_buffer,
      .cx_stride = stride_width * output_pixel_stride,
      .cy_stride = stride_height * output_width * output_pixel_stride,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * input_pixel_stride,
      .bc_stride = output_size * output_pixel_stride,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.general_case,
    };
    memcpy(&deconvolution_op->context.subconv.params, params, params_size);
  }

  const size_t output_height_positions = divide_round_up(output_height, stride_height);
  const size_t output_width_positions = divide_round_up(output_width, stride_width);

  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * stride_height * stride_width *
      output_height_positions * divide_round_up(output_width_positions, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }

  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
    deconvolution_op->compute.task_5d_tile_2d = use_gemm ?
      (pthreadpool_task_5d_tile_2d_t) xnn_compute_subgemm2d : (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = stride_height * stride_width;
    deconvolution_op->compute.range[2] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[3] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[4] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
    deconvolution_op->compute.task_6d_tile_2d = use_gemm ?
      (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subgemm2d : (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = stride_height * stride_width;
    deconvolution_op->compute.range[3] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[4] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[5] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }

  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_deconvolution2d_nhwc(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const void* input,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t params_size,
    size_t num_threads)
{
  deconvolution_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(deconvolution_op->type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_height >= deconvolution_op->stride_height) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " height adjustment: "
      "height adjustment must be smaller than height stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_height, deconvolution_op->stride_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_width >= deconvolution_op->stride_width) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " width adjustment: "
      "width adjustment must be smaller than width stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_width, deconvolution_op->stride_width);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    deconvolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  deconvolution_op->batch_size = batch_size;
  deconvolution_op->input_height = input_height;
  deconvolution_op->input_width = input_width;
  deconvolution_op->input = input;
  deconvolution_op->output = output;

  if (deconvolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    // Recompute padding for the input size.
    const uint32_t dilated_kernel_height_minus_1 = (deconvolution_op->kernel_height - 1) * deconvolution_op->dilation_height;
    const uint32_t dilated_kernel_width_minus_1 = (deconvolution_op->kernel_width - 1) * deconvolution_op->dilation_width;

    const size_t total_padding_height = doz(dilated_kernel_height_minus_1, (input_height - 1) % deconvolution_op->stride_height);
    const size_t total_padding_width = doz(dilated_kernel_width_minus_1, (input_width - 1) % deconvolution_op->stride_width);

    const uint32_t padding_top = deconvolution_op->padding_top = total_padding_height / 2;
    const uint32_t padding_left = deconvolution_op->padding_left = total_padding_width / 2;
    deconvolution_op->padding_bottom = total_padding_height - padding_top;
    deconvolution_op->padding_right = total_padding_width - padding_left;
  }

  const size_t output_height = deconvolution_op->output_height = compute_output_dimension(
    input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
    adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
  const size_t output_width = deconvolution_op->output_width = compute_output_dimension(
    input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
    adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);
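  // Example: a 5x5 input with 2x2 stride, 3x3 kernel, dilation 1, zero padding,
  // and zero adjustment yields a (2 * (5 - 1) + 3) x (2 * (5 - 1) + 3) = 11x11 output.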

  switch (deconvolution_op->ukernel.type) {
    case xnn_ukernel_type_igemm:
      return setup_conv_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads);
    case xnn_ukernel_type_subconv2d:
    {
      const bool no_padding = (deconvolution_op->padding_top | deconvolution_op->padding_right | deconvolution_op->padding_bottom | deconvolution_op->padding_left) == 0;
      const bool no_adjustment = (adjustment_height | adjustment_width) == 0;
      const bool use_gemm = no_padding && no_adjustment &&
        deconvolution_op->kernel_height == deconvolution_op->stride_height &&
        deconvolution_op->kernel_width == deconvolution_op->stride_width &&
        deconvolution_op->ukernel.igemm.gemm_case.function[XNN_UARCH_DEFAULT] != NULL;
      return setup_subconv2d_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads, use_gemm);
    }
    default:
      XNN_UNREACHABLE;
  }
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_qu8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &deconvolution_op->params.qu8_gemm, sizeof(deconvolution_op->params.qu8_gemm),
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_f32) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &deconvolution_op->params.f32_minmax, sizeof(deconvolution_op->params.f32_minmax),
    pthreadpool_get_threads_count(threadpool));
}

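// A minimal end-to-end sketch (illustrative only; `input`/`output` are assumed
// NHWC buffers of the right size and `deconvolution_op` was created as above):
//
//   status = xnn_setup_deconvolution2d_nhwc_f32(
//     deconvolution_op, 1 /* batch */, 5, 5 /* input height x width */,
//     0, 0 /* adjustment */, input, output, threadpool);
//   status = xnn_run_operator(deconvolution_op, threadpool);
//   status = xnn_delete_operator(deconvolution_op);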