// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t output_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}
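
// For example: a 4-pixel input dimension with stride 2, a 3-tap kernel, unit
// dilation, zero adjustment, and no output padding yields an effective kernel
// of (3 - 1) * 1 + 1 = 3 and doz(2 * (4 - 1) + 0 + 3, 0) = 9 output pixels;
// doz() (difference-or-zero) clamps the output-padding subtraction at zero.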

static enum xnn_status create_deconvolution2d_nhwc(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const void* kernel,
    const void* bias,
    uint32_t flags,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    xnn_pack_conv_goki_w_function pack_conv_goki_w,
    xnn_pack_deconv_goki_w_function pack_deconv_goki_w,
    const void* packing_params,
    int input_padding_byte,
    int packed_weights_padding_byte,
    const void* params,
    size_t params_size,
    const struct gemm_parameters* gemm_parameters,
    const struct gemm_fused_ukernels* gemm_ukernels,
    enum xnn_operator_type operator_type,
    xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
      xnn_operator_type_to_string(operator_type), groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create %s operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  const bool any_padding = (output_padding_left | output_padding_top | output_padding_right | output_padding_bottom) != 0;
  if (any_padding && (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
      "TensorFlow SAME padding can't be combined with explicit padding specification",
      xnn_operator_type_to_string(operator_type),
      output_padding_top, output_padding_left, output_padding_bottom, output_padding_right);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  const uint32_t mr = gemm_parameters->mr;
  const uint32_t nr = gemm_parameters->nr;
  const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
  const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;

  const uint32_t n_stride = round_up(group_output_channels, nr);
  const uint32_t k_stride = round_up_po2(group_input_channels, kr);
  const uint32_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = (sizeof(float) * kernel_size * k_stride + sizeof(float)) * n_stride;
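  // A strided deconvolution without dilation can be decomposed into
  // stride_height * stride_width smaller convolutions (subkernels), one per
  // output phase (y % stride_height, x % stride_width). The check below also
  // requires stride <= kernel so that every subkernel keeps at least one tap.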
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      (sizeof(float) * kernel_size * k_stride + sizeof(float) * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator subconvolution buffer",
        subconvolution_buffer_size, xnn_operator_type_to_string(operator_type));
      goto error;
    }

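    // Pre-compute the geometry-independent fields of each subconvolution:
    // the (offset_y, offset_x) phase owns the kernel taps congruent to it
    // modulo the stride, so its subkernel has roughly kernel/stride taps per
    // dimension, rounded up for the leading phases.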
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = sizeof(float) + k_stride * subkernel_size * sizeof(float);
        subconvolution_params++;
      }
    }
  }
  deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator packed weights",
      packed_group_weights_size * groups, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  memset(deconvolution_op->packed_weights, packed_weights_padding_byte, packed_group_weights_size * groups);

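  // Repack the weights into the blocked layout the micro-kernels consume:
  // within each group, output channels are processed in panels of nr, each
  // panel carrying its bias values followed by kernel taps with the input
  // channels rounded up to kr (the same layout the w_stride computations in
  // the setup functions below assume).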
  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      pack_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights,
        packing_params);
      break;
    case xnn_ukernel_type_subconv2d:
      pack_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer,
        packing_params);
      break;
    default:
      XNN_UNREACHABLE;
  }

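  // Indirection entries that would point outside the input are redirected to
  // this zero buffer, so the micro-kernels read the padding value (the input
  // zero point for quantized operators) without any bounds checks.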
  const size_t zero_size = (k_stride << log2_input_element_size) + XNN_EXTRA_BYTES;
  deconvolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
  if (deconvolution_op->zero_buffer == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator zero padding",
      zero_size, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  memset(deconvolution_op->zero_buffer, input_padding_byte, zero_size);

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  memcpy(&deconvolution_op->params, params, params_size);
  deconvolution_op->type = operator_type;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .general_case = gemm_ukernels->igemm,
    .gemm_case = gemm_ukernels->gemm,
    .mr = mr,
    .nr = nr,
    .kr = kr,
  };

  if (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    if ((stride_height | stride_width) == 1) {
      // With unit stride the SAME padding is independent of the input size,
      // so it can be computed statically here.
      const uint32_t padding_height = (kernel_height - 1) * dilation_height;
      const uint32_t padding_width = (kernel_width - 1) * dilation_width;

      const uint32_t padding_top = padding_height / 2;
      const uint32_t padding_left = padding_width / 2;

      deconvolution_op->padding_top = padding_top;
      deconvolution_op->padding_left = padding_left;
      deconvolution_op->padding_bottom = padding_height - padding_top;
      deconvolution_op->padding_right = padding_width - padding_left;
    } else {
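      // With non-unit stride the SAME padding depends on the input size, so
      // record the flag and defer the computation to setup time (see
      // setup_deconvolution2d_nhwc).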
      deconvolution_op->flags = XNN_FLAG_TENSORFLOW_SAME_PADDING;
    }
  }

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}

enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 1.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater than or equal to 1.0",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  const union xnn_qu8_gemm_params params = xnn_init_qu8_gemm_params(
    kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
  const struct xnn_qu8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
    .kernel_zero_point = kernel_zero_point,
  };
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_qu8_deconv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qu8.gemm, &xnn_params.qu8.gemm.minmax,
    xnn_operator_type_deconvolution_nhwc_qu8,
    deconvolution_op_out);
}

enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const struct gemm_parameters* gemm_parameters = &xnn_params.f32.gemm;
  if (gemm_parameters->nr > group_output_channels) {
    // The default micro-kernel has more output-channel lanes than there are
    // output channels per group, so its panels would be mostly padding; try
    // the narrower secondary micro-kernel instead.
    if (xnn_params.f32.gemm2.minmax.igemm.function[XNN_UARCH_DEFAULT] != NULL) {
      gemm_parameters = &xnn_params.f32.gemm2;
    }
  }
  const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
  if (linear_activation && gemm_parameters->linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
    gemm_ukernels = &gemm_parameters->linear;
  }

  const union xnn_f32_minmax_params params = xnn_init_f32_minmax_params(output_min, output_max);
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_f32_deconv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    gemm_parameters, gemm_ukernels,
    xnn_operator_type_deconvolution_nhwc_f32,
    deconvolution_op_out);
}
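
// A minimal usage sketch for the f32 operator above (shapes are illustrative;
// error handling and buffer allocation are elided, and threadpool may be NULL
// for single-threaded execution):
//
//   xnn_initialize(NULL /* allocator */);
//   xnn_operator_t deconv = NULL;
//   xnn_create_deconvolution2d_nhwc_f32(
//     0, 0, 0, 0,                     // output padding top/right/bottom/left
//     3, 3, 2, 2, 1, 1,               // 3x3 kernel, 2x2 stride, no dilation
//     1, channels, channels,          // one group, its input/output channels
//     channels, channels,             // dense input/output pixel strides
//     kernel, bias, -INFINITY, INFINITY, 0 /* flags */, &deconv);
//   xnn_setup_deconvolution2d_nhwc_f32(
//     deconv, batch_size, input_height, input_width,
//     0, 0 /* adjustment */, input, output, threadpool);
//   xnn_run_operator(deconv, threadpool);
//   xnn_delete_operator(deconv);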

static enum xnn_status setup_conv_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

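  // The indirection buffer holds one pointer into the input (or into the zero
  // buffer, for padding) per kernel tap and output pixel, rounded up to whole
  // mr tiles; the IGEMM micro-kernel gathers its inputs through these
  // pointers instead of materializing an im2col matrix. The buffer depends
  // only on the input geometry, so it is rebuilt only when the input
  // dimensions change.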
  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator indirection buffer",
        indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.igemm = (struct igemm_context) {
      .ks = kernel_size,
      .ks_scaled = kernel_size * mr * sizeof(void*),
      .kc = group_input_channels << log2_input_element_size,
      .w_stride = w_stride,
      .indirect_a = deconvolution_op->indirection_buffer,
      .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
      .zero = deconvolution_op->zero_buffer,
      .packed_w = deconvolution_op->packed_weights,
      .c = deconvolution_op->output,
      .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
      .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.general_case,
  };
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_case;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, params_size);

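  // Cap the output-channel tile at nc so that, combined with the mr-sized
  // pixel tiles, each thread receives roughly target_tiles_per_thread tiles;
  // nc stays a multiple of nr so tiles align with the micro-kernel panels.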
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  if (groups == 1) {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
      deconvolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
      deconvolution_op->compute.range[0] = output_size;
      deconvolution_op->compute.range[1] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
      deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = groups;
      deconvolution_op->compute.range[2] = output_size;
      deconvolution_op->compute.range[3] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
      deconvolution_op->compute.range[0] = groups;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_subconv2d_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads,
  bool use_gemm)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t stride_height = deconvolution_op->stride_height;
  const size_t stride_width = deconvolution_op->stride_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;

  const size_t input_pixel_stride = deconvolution_op->input_pixel_stride << log2_input_element_size;
  const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;

  const bool any_size_change =
    input_height != deconvolution_op->last_input_height ||
    input_width != deconvolution_op->last_input_width ||
    output_height != deconvolution_op->last_output_height ||
    output_width != deconvolution_op->last_output_width;

  if (any_size_change || output != deconvolution_op->last_output) {
    // Initialize subconvolution parameters which depend on output dimensions or MR.
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
    const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
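    // Each (offset_y, offset_x) phase writes the output pixels whose
    // coordinates are congruent to it modulo the stride, shifted by the
    // padding; slice_height and slice_width count those pixels per column and
    // row, and the output pointer below addresses the first of them.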
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
        const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
        subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
        subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
        subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
        subconvolution_params->output =
          (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
        ++subconvolution_params;
      }
    }
    deconvolution_op->last_output = output;
  }

  if (any_size_change) {
    if (!use_gemm) {
      const size_t indirection_buffer_size = sizeof(void*) *
        kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);

      const void** indirection_buffer =
        (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        xnn_log_error(
          "failed to allocate %zu bytes for %s operator indirection buffer",
          indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
        return xnn_status_out_of_memory;
      }
      deconvolution_op->indirection_buffer = indirection_buffer;
      deconvolution_op->last_input = input;

      xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);
    }
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;
    deconvolution_op->last_output_height = output_height;
    deconvolution_op->last_output_width = output_width;
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const uint32_t kr = deconvolution_op->ukernel.igemm.kr;
  const size_t w_stride = stride_height * stride_width * bias_element_size +
    (round_up_po2(group_input_channels, kr) * kernel_size << log2_filter_element_size);
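  // The GEMM variant reads the input directly, as each subkernel is a dense
  // matrix product; the subconv variant instead goes through the indirection
  // buffer and zero buffer so it can handle padding and partial coverage.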
  if (use_gemm) {
    deconvolution_op->context.subgemm = (struct subgemm_context) {
        .subconvolution_params = deconvolution_op->subconvolution_buffer,
        .kc = group_input_channels << log2_input_element_size,
        .a = input,
        .ax_stride = input_pixel_stride,
        .ay_stride = input_width * input_pixel_stride,
        .cx_stride = stride_width * output_pixel_stride,
        .cy_stride = stride_height * output_width * output_pixel_stride,
        .cn_stride = nr << log2_output_element_size,
        .ga_stride = group_input_channels << log2_input_element_size,
        .gw_stride = w_stride * round_up(group_output_channels, nr),
        .gc_stride = group_output_channels << log2_output_element_size,
        .ba_stride = input_height * input_width * input_pixel_stride,
        .bc_stride = output_size * output_pixel_stride,
        .log2_csize = log2_output_element_size,
        .ukernel = deconvolution_op->ukernel.igemm.gemm_case,
    };
    memcpy(&deconvolution_op->context.subgemm.params, params, params_size);
  } else {
    deconvolution_op->context.subconv = (struct subconv_context) {
        .subconvolution_params = deconvolution_op->subconvolution_buffer,
        .kc = group_input_channels << log2_input_element_size,
        .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
        .zero = deconvolution_op->zero_buffer,
        .cx_stride = stride_width * output_pixel_stride,
        .cy_stride = stride_height * output_width * output_pixel_stride,
        .cn_stride = nr << log2_output_element_size,
        .ga_stride = group_input_channels << log2_input_element_size,
        .gw_stride = w_stride * round_up(group_output_channels, nr),
        .gc_stride = group_output_channels << log2_output_element_size,
        .ba_stride = input_height * input_width * input_pixel_stride,
        .bc_stride = output_size * output_pixel_stride,
        .log2_csize = log2_output_element_size,
        .ukernel = deconvolution_op->ukernel.igemm.general_case,
    };
    memcpy(&deconvolution_op->context.subconv.params, params, params_size);
  }

  const size_t output_height_positions = divide_round_up(output_height, stride_height);
  const size_t output_width_positions = divide_round_up(output_width, stride_width);

  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * stride_height * stride_width *
      output_height_positions * divide_round_up(output_width_positions, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }

  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
    deconvolution_op->compute.task_5d_tile_2d = use_gemm ?
      (pthreadpool_task_5d_tile_2d_t) xnn_compute_subgemm2d : (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = stride_height * stride_width;
    deconvolution_op->compute.range[2] = output_height_positions;
    deconvolution_op->compute.range[3] = output_width_positions;
    deconvolution_op->compute.range[4] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
    deconvolution_op->compute.task_6d_tile_2d = use_gemm ?
      (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subgemm2d : (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = stride_height * stride_width;
    deconvolution_op->compute.range[3] = output_height_positions;
    deconvolution_op->compute.range[4] = output_width_positions;
    deconvolution_op->compute.range[5] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }

  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_deconvolution2d_nhwc(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const void* input,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  deconvolution_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(deconvolution_op->type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_height >= deconvolution_op->stride_height) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " height adjustment: "
      "height adjustment must be smaller than height stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_height, deconvolution_op->stride_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_width >= deconvolution_op->stride_width) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " width adjustment: "
      "width adjustment must be smaller than width stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_width, deconvolution_op->stride_width);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    deconvolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  deconvolution_op->batch_size = batch_size;
  deconvolution_op->input_height = input_height;
  deconvolution_op->input_width = input_width;
  deconvolution_op->input = input;
  deconvolution_op->output = output;

  if (deconvolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    // Recompute the TensorFlow SAME padding for the current input size.
    const uint32_t dilated_kernel_height_minus_1 = (deconvolution_op->kernel_height - 1) * deconvolution_op->dilation_height;
    const uint32_t dilated_kernel_width_minus_1 = (deconvolution_op->kernel_width - 1) * deconvolution_op->dilation_width;

    const size_t total_padding_height = doz(dilated_kernel_height_minus_1, (input_height - 1) % deconvolution_op->stride_height);
    const size_t total_padding_width = doz(dilated_kernel_width_minus_1, (input_width - 1) % deconvolution_op->stride_width);

    const uint32_t padding_top = deconvolution_op->padding_top = total_padding_height / 2;
    const uint32_t padding_left = deconvolution_op->padding_left = total_padding_width / 2;
    deconvolution_op->padding_bottom = total_padding_height - padding_top;
    deconvolution_op->padding_right = total_padding_width - padding_left;
  }

  const size_t output_height = deconvolution_op->output_height = compute_output_dimension(
    input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
    adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
  const size_t output_width = deconvolution_op->output_width = compute_output_dimension(
    input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
    adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);

  switch (deconvolution_op->ukernel.type) {
    case xnn_ukernel_type_igemm:
      return setup_conv_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads);
    case xnn_ukernel_type_subconv2d:
    {
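      // When the kernel exactly tiles the output (kernel == stride) with no
      // padding and no adjustment, every input pixel maps to a disjoint
      // kernel-sized output block, so each subconvolution degenerates into a
      // plain GEMM and no indirection buffer is needed.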
      const bool no_padding = (deconvolution_op->padding_top | deconvolution_op->padding_right | deconvolution_op->padding_bottom | deconvolution_op->padding_left) == 0;
      const bool no_adjustment = (adjustment_height | adjustment_width) == 0;
      const bool use_gemm = no_padding && no_adjustment &&
        deconvolution_op->kernel_height == deconvolution_op->stride_height &&
        deconvolution_op->kernel_width == deconvolution_op->stride_width &&
        deconvolution_op->ukernel.igemm.gemm_case.function[XNN_UARCH_DEFAULT] != NULL;
      return setup_subconv2d_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads, use_gemm);
    }
    default:
      XNN_UNREACHABLE;
  }
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_qu8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &deconvolution_op->params.qu8_gemm, sizeof(deconvolution_op->params.qu8_gemm),
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_f32) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &deconvolution_op->params.f32_minmax, sizeof(deconvolution_op->params.f32_minmax),
    pthreadpool_get_threads_count(threadpool));
}