• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <assert.h>
10 #include <stdbool.h>
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <string.h>
14 #include <math.h>
15 
16 #include <xnnpack.h>
17 #include <xnnpack/allocator.h>
18 #include <xnnpack/indirection.h>
19 #include <xnnpack/log.h>
20 #include <xnnpack/math.h>
21 #include <xnnpack/operator.h>
22 #include <xnnpack/pack.h>
23 #include <xnnpack/params.h>
24 
25 
// Computes the output spatial extent of a 2D deconvolution along one axis.
//
// For a transposed convolution the nominal output size is
//   stride * (input - 1) + adjustment + effective_kernel
// where the effective kernel extent accounts for dilation. Output padding
// is then subtracted with saturation at zero (doz = difference-or-zero),
// so an oversized padding yields 0 rather than wrapping around.
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t output_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  const size_t dilated_kernel_dimension =
    dilation_dimension * (kernel_dimension - 1) + 1;
  const size_t nominal_output_dimension =
    (input_dimension - 1) * stride_dimension + adjustment_dimension + dilated_kernel_dimension;
  return doz(nominal_output_dimension, output_padding_dimension);
}
39 
// Shared backend for the QS8/QU8/F32 deconvolution creation entry points.
//
// Validates the geometry and channel configuration, allocates the operator
// descriptor, chooses between the generic IGEMM path and the subconvolution
// path, packs the weights accordingly, and fills in the operator fields.
// On any failure, partially-constructed state is released through
// xnn_delete_operator and an error status is returned.
//
// The packing callbacks, packing params, element sizes, and padding bytes
// parameterize the datatype-specific behavior so this function stays
// datatype-agnostic.
// NOTE(review): the `flags` parameter is not consumed anywhere in this
// function as written — confirm whether it is reserved for future use.
static enum xnn_status create_deconvolution2d_nhwc(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const void* kernel,
    const void* bias,
    uint32_t flags,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    xnn_pack_conv_goki_w_function pack_conv_goki_w,
    xnn_pack_deconv_goki_w_function pack_deconv_goki_w,
    const void* packing_params,
    int input_padding_byte,
    int packed_weights_padding_byte,
    const void* params,
    size_t params_size,
    const struct gemm_parameters* gemm_parameters,
    const struct gemm_fused_ukernels* gemm_ukernels,
    enum xnn_operator_type operator_type,
    xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
      xnn_operator_type_to_string(operator_type), groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_output_channels);
    goto error;
  }

  // The input pixel stride must cover all input channels across all groups.
  // (Fixed: the message previously said "output channels" — copy-paste from
  // the output-stride check below.)
  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create %s operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  // Micro-kernel tile parameters; kr/sr are stored as log2 in gemm_parameters.
  const uint32_t mr = gemm_parameters->mr;
  const uint32_t nr = gemm_parameters->nr;
  const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
  const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;

  // Use size_t for packed-layout strides so that large channel counts do not
  // truncate before feeding the size computations below.
  const size_t n_stride = round_up(group_output_channels, nr);
  const size_t k_stride = round_up_po2(group_input_channels, kr * sr);
  const size_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = (((kernel_size * k_stride) << log2_filter_element_size) + bias_element_size) * n_stride;
  // The subconvolution decomposition applies when the operator strides
  // without dilation and the stride does not exceed the kernel: the kernel
  // is split into stride_height*stride_width subkernels, each with its own
  // bias copy and packed-weight layout.
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      (((kernel_size * k_stride) << log2_filter_element_size) + bias_element_size * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator subconvolution buffer",
        subconvolution_buffer_size, xnn_operator_type_to_string(operator_type));
      goto error;
    }

    // Pre-compute per-subkernel strides; the remaining subconvolution fields
    // are filled in at setup time.
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = bias_element_size + ((k_stride * subkernel_size) << log2_filter_element_size);
        subconvolution_params++;
      }
    }
  }
  deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator packed weights",
      packed_group_weights_size * groups, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  // Padding regions of the packed buffer must hold the datatype-specific
  // neutral value (e.g. the kernel zero point for QU8).
  memset(deconvolution_op->packed_weights, packed_weights_padding_byte, packed_group_weights_size * groups);

  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      pack_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights,
        0 /* extra bytes */,
        packing_params);
      break;
    case xnn_ukernel_type_subconv2d:
      pack_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer,
        packing_params);
      break;
    default:
      XNN_UNREACHABLE;
  }

  // Zero buffer stands in for out-of-bounds input rows in the indirection
  // buffer; it is filled with the input zero point for quantized datatypes.
  const size_t zero_size = (k_stride << log2_input_element_size) + XNN_EXTRA_BYTES;
  deconvolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
  if (deconvolution_op->zero_buffer == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator zero padding",
      zero_size, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  memset(deconvolution_op->zero_buffer, input_padding_byte, zero_size);

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  memcpy(&deconvolution_op->params, params, params_size);
  deconvolution_op->type = operator_type;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .general_case = gemm_ukernels->igemm,
    .gemm_case = gemm_ukernels->gemm,
    .mr = mr,
    .nr = nr,
    .kr = kr,
    .sr = sr,
  };

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}
273 
// Creates a 2D deconvolution (transposed convolution) operator for
// signed 8-bit quantized (QS8) data in NHWC layout.
//
// Validates the quantization parameters, initializes the QS8 conv minmax
// params, and delegates geometry validation, allocation, and weight
// packing to create_deconvolution2d_nhwc.
//
// Returns xnn_status_success and stores the operator in
// *deconvolution_op_out on success; returns an error status otherwise.
enum xnn_status xnn_create_deconvolution2d_nhwc_qs8(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    int8_t input_zero_point,
    float input_scale,
    float kernel_scale,
    const int8_t* kernel,
    const int32_t* bias,
    int8_t output_zero_point,
    float output_scale,
    int8_t output_min,
    int8_t output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  // Scales must be positive, finite, and normal (not subnormal/zero/NaN/inf).
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  // The combined (input*kernel)/output scale must be below 256.0 for the
  // fixed-point requantization scheme used by the QS8 micro-kernels.
  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 256.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater or equal to 256.0",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  // Initialize micro-kernel params; the init callback may be absent on
  // platforms without a QS8 GEMM, in which case params stays uninitialized
  // (creation would presumably fail elsewhere — see xnn_params setup).
  union xnn_qs8_conv_minmax_params params;
  if XNN_LIKELY(xnn_params.qs8.gemm.init.qs8 != NULL) {
    xnn_params.qs8.gemm.init.qs8(&params,
      requantization_scale, output_zero_point, output_min, output_max);
  }
  const struct xnn_qs8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
  };
  // Element sizes are log2-encoded; int8 elements have log2 size 0.
  // Input padding uses the input zero point so padded pixels are "zero".
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_qs8_deconv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qs8.gemm, &xnn_params.qs8.gemm.minmax,
    xnn_operator_type_deconvolution_nhwc_qs8,
    deconvolution_op_out);
}
367 
// Creates a 2D deconvolution (transposed convolution) operator for
// unsigned 8-bit quantized (QU8) data in NHWC layout.
//
// Like the QS8 variant, but QU8 additionally carries a kernel zero point,
// which is used both in the packing params and as the padding byte for the
// packed-weights buffer (so padded filter taps are "zero" after offsetting).
//
// Returns xnn_status_success and stores the operator in
// *deconvolution_op_out on success; returns an error status otherwise.
enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  // Scales must be positive, finite, and normal (not subnormal/zero/NaN/inf).
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  // The combined (input*kernel)/output scale must be below 256.0 for the
  // fixed-point requantization scheme used by the QU8 micro-kernels.
  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 256.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater or equal to 256.0",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  // Initialize micro-kernel params; the init callback may be absent on
  // platforms without a QU8 GEMM, in which case params stays uninitialized.
  union xnn_qu8_conv_minmax_params params;
  if XNN_LIKELY(xnn_params.qu8.gemm.init.qu8 != NULL) {
    xnn_params.qu8.gemm.init.qu8(&params,
      kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
  }
  const struct xnn_qu8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
    .kernel_zero_point = kernel_zero_point,
  };
  // Element sizes are log2-encoded; uint8 elements have log2 size 0.
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_qu8_deconv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qu8.gemm, &xnn_params.qu8.gemm.minmax,
    xnn_operator_type_deconvolution_nhwc_qu8,
    deconvolution_op_out);
}
463 
// Creates a 2D deconvolution (transposed convolution) operator for
// single-precision floating-point (F32) data in NHWC layout.
//
// Validates the output clamping range, selects the GEMM micro-kernel
// variant (the narrow-NR "gemm2" variant when it covers the channel count,
// and the linear-activation variant when no clamping is needed), then
// delegates to create_deconvolution2d_nhwc.
//
// Returns xnn_status_success and stores the operator in
// *deconvolution_op_out on success; returns an error status otherwise.
enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const struct gemm_parameters* gemm_parameters = &xnn_params.f32.gemm;
  if (gemm_parameters->nr > group_output_channels) {
    // Default micro-kernel is suboptimal. Try to find a better micro-kernel.
    if (xnn_params.f32.gemm2.minmax.igemm.function[XNN_UARCH_DEFAULT] != NULL) {
      gemm_parameters = &xnn_params.f32.gemm2;
    }
  }
  const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
  // [-inf, +inf] clamping is a no-op, so prefer the linear-activation
  // kernels when available.
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
  if (linear_activation && gemm_parameters->linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
    gemm_ukernels = &gemm_parameters->linear;
  }

  union xnn_f32_minmax_params params;
  // Guard on the SAME gemm_parameters that is invoked below. Previously the
  // guard checked xnn_params.f32.gemm while the call went through
  // gemm_parameters, which may point at xnn_params.f32.gemm2 after the
  // selection above.
  if XNN_LIKELY(gemm_parameters->init.f32 != NULL) {
    gemm_parameters->init.f32(&params, output_min, output_max);
  }
  // Element sizes are log2-encoded; float elements have log2 size 2.
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_f32_deconv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    gemm_parameters, gemm_ukernels,
    xnn_operator_type_deconvolution_nhwc_f32,
    deconvolution_op_out);
}
544 
// Configures a deconvolution operator for execution via the generic IGEMM
// path (no subconvolution decomposition).
//
// (Re)builds the indirection buffer when the input spatial dimensions
// change, fills in the igemm_context with strides derived from the packed
// layout, and selects the pthreadpool parallelization strategy based on
// batch size, group count, and thread count. Leaves the operator in the
// xnn_run_state_ready state.
static enum xnn_status setup_conv_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  // The indirection buffer is sized for output rounded up to the MR tile,
  // with one input-row pointer per kernel tap per output pixel.
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

  // Only rebuild the indirection buffer when the input spatial dimensions
  // change; for a same-shape call with a different input pointer, the
  // a_offset below compensates relative to last_input.
  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator indirection buffer",
        indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  // Stride between packed weight columns: bias followed by the KR*SR-padded
  // filter taps; must match the layout produced at creation time.
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr * deconvolution_op->ukernel.igemm.sr) * kernel_size << log2_filter_element_size);
  // NOTE(review): .c reads deconvolution_op->output rather than the
  // `output` parameter — presumably the caller stores output into the
  // operator before calling; confirm against the setup entry point.
  deconvolution_op->context.igemm = (struct igemm_context) {
      .ks = kernel_size,
      .ks_scaled = kernel_size * mr * sizeof(void*),
      .kc = group_input_channels << log2_input_element_size,
      .w_stride = w_stride,
      .indirect_a = deconvolution_op->indirection_buffer,
      .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
      .zero = deconvolution_op->zero_buffer,
      .packed_w = deconvolution_op->packed_weights,
      .c = deconvolution_op->output,
      .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
      .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.general_case,
  };
  // Specialized MR=1 kernel for single-pixel outputs, when available.
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_case;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, params_size);

  // Shrink the NC tile so each thread gets ~target_tiles_per_thread tiles,
  // keeping NC a multiple of NR.
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  // Pick the parallelization shape: batch and group dimensions are added
  // only when they exceed 1; output pixels x output channels are tiled
  // by (mr, nc) in all cases.
  if (groups == 1) {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
      deconvolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
      deconvolution_op->compute.range[0] = output_size;
      deconvolution_op->compute.range[1] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
      deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = groups;
      deconvolution_op->compute.range[2] = output_size;
      deconvolution_op->compute.range[3] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
      deconvolution_op->compute.range[0] = groups;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}
667 
// Configures a deconvolution operator for the subconvolution (SUBCONV2D) path:
// the output is decomposed into stride_height * stride_width interleaved
// phases, each computed as an independent slice. When use_gemm is true, a
// direct GEMM context is set up (inputs read with computed strides); otherwise
// an indirection buffer is (re)built and the general subconvolution context is
// used. Caches last seen input/output dimensions to skip redundant work on
// repeated setups. Returns xnn_status_success, or xnn_status_out_of_memory if
// the indirection buffer cannot be allocated.
static enum xnn_status setup_subconv2d_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,   // log2 of input element size in bytes
  uint32_t log2_filter_element_size,  // log2 of filter element size in bytes
  uint32_t bias_element_size,         // bias element size in bytes (NOT log2)
  uint32_t log2_output_element_size,  // log2 of output element size in bytes
  const void* params,                 // ukernel params, memcpy'd into the context
  size_t params_size,
  size_t num_threads,
  bool use_gemm)                      // true: direct-GEMM fast path; false: indirection-based path
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t stride_height = deconvolution_op->stride_height;
  const size_t stride_width = deconvolution_op->stride_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;

  // Convert per-pixel element strides to byte strides.
  const size_t input_pixel_stride = deconvolution_op->input_pixel_stride << log2_input_element_size;
  const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;

  const bool any_size_change =
    input_height != deconvolution_op->last_input_height ||
    input_width != deconvolution_op->last_input_width ||
    output_height != deconvolution_op->last_output_height ||
    output_width != deconvolution_op->last_output_width;

  if (any_size_change || output != deconvolution_op->last_output) {
    // Initialize subconvolution parameters which depend on output dimensions or MR.
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
    const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
    // One subconvolution per (offset_y, offset_x) phase of the stride grid.
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        // First output row/column this phase contributes to, shifted by padding.
        const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
        const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
        // NOTE(review): indirection_x_stride is read here but not set in this
        // function -- presumably initialized at operator creation; confirm.
        subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
        // Number of output columns/rows in this phase's strided slice.
        subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
        subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
        // Base output pointer for this phase (byte arithmetic via uintptr_t).
        subconvolution_params->output =
          (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
        ++subconvolution_params;
      }
    }
    deconvolution_op->last_output = output;
  }

  if (any_size_change) {
    if (!use_gemm) {
      // Only the general path uses the indirection buffer; the GEMM fast path
      // reads inputs directly via strides.
      const size_t indirection_buffer_size = sizeof(void*) *
        kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);

      const void** indirection_buffer =
        (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        xnn_log_error(
          "failed to allocate %zu bytes for %s operator indirection buffer",
          indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
        return xnn_status_out_of_memory;
      }
      deconvolution_op->indirection_buffer = indirection_buffer;
      // Remember the input the indirection entries point into; subsequent
      // setups with the same sizes rebase via a_offset instead of rebuilding.
      deconvolution_op->last_input = input;

      xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);
    }
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;
    deconvolution_op->last_output_height = output_height;
    deconvolution_op->last_output_width = output_width;
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const uint32_t kr = deconvolution_op->ukernel.igemm.kr;
  const uint32_t sr = deconvolution_op->ukernel.igemm.sr;
  // Byte stride between packed weights of consecutive output-channel tiles.
  // NOTE(review): the bias term is scaled by stride_height * stride_width --
  // presumably one bias copy per stride phase; confirm against the packing code.
  const size_t w_stride = stride_height * stride_width * bias_element_size +
    (round_up_po2(group_input_channels, kr * sr) * kernel_size << log2_filter_element_size);
  if (use_gemm) {
    // Direct-GEMM context: input accessed via ax/ay strides, no indirection.
    deconvolution_op->context.subgemm = (struct subgemm_context) {
        .subconvolution_params = deconvolution_op->subconvolution_buffer,
        .kc = group_input_channels << log2_input_element_size,
        .a = input,
        .ax_stride = input_pixel_stride,
        .ay_stride = input_width * input_pixel_stride,
        .cx_stride = stride_width * output_pixel_stride,
        .cy_stride = stride_height * output_width * output_pixel_stride,
        .cn_stride = nr << log2_output_element_size,
        .ga_stride = group_input_channels << log2_input_element_size,
        .gw_stride = w_stride * round_up(group_output_channels, nr),
        .gc_stride = group_output_channels << log2_output_element_size,
        .ba_stride = input_height * input_width * input_pixel_stride,
        .bc_stride = output_size * output_pixel_stride,
        .log2_csize = log2_output_element_size,
        .ukernel = deconvolution_op->ukernel.igemm.gemm_case,
    };
    memcpy(&deconvolution_op->context.subgemm.params, params, params_size);
  } else {
    // General context: input accessed through the indirection buffer;
    // a_offset rebases indirection pointers built against last_input onto the
    // current input pointer.
    deconvolution_op->context.subconv = (struct subconv_context) {
        .subconvolution_params = deconvolution_op->subconvolution_buffer,
        .kc = group_input_channels << log2_input_element_size,
        .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
        .zero = deconvolution_op->zero_buffer,
        .cx_stride = stride_width * output_pixel_stride,
        .cy_stride = stride_height * output_width * output_pixel_stride,
        .cn_stride = nr << log2_output_element_size,
        .ga_stride = group_input_channels << log2_input_element_size,
        .gw_stride = w_stride * round_up(group_output_channels, nr),
        .gc_stride = group_output_channels << log2_output_element_size,
        .ba_stride = input_height * input_width * input_pixel_stride,
        .bc_stride = output_size * output_pixel_stride,
        .log2_csize = log2_output_element_size,
        .ukernel = deconvolution_op->ukernel.igemm.general_case,
    };
    memcpy(&deconvolution_op->context.subconv.params, params, params_size);
  }

  const size_t output_height_positions = divide_round_up(output_height, stride_height);
  const size_t output_width_positions = divide_round_up(output_width, stride_width);

  // Choose the output-channel tile size: keep the full group unless splitting
  // it produces enough tiles (~target_tiles_per_thread each) to keep all
  // threads busy.
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * stride_height * stride_width *
      output_height_positions * divide_round_up(output_width_positions, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      // Round the split up to a multiple of NR so ukernel tiles stay full.
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }

  // Select the parallelization scheme: grouped operators need one extra
  // iteration dimension for the group index.
  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
    deconvolution_op->compute.task_5d_tile_2d = use_gemm ?
      (pthreadpool_task_5d_tile_2d_t) xnn_compute_subgemm2d : (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = stride_height * stride_width;
    deconvolution_op->compute.range[2] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[3] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[4] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
    deconvolution_op->compute.task_6d_tile_2d = use_gemm ?
      (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subgemm2d : (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = stride_height * stride_width;
    deconvolution_op->compute.range[3] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[4] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[5] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }

  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}
839 
// Validates setup arguments shared by all NHWC deconvolution data types,
// computes the output dimensions, and dispatches to the appropriate
// compute-path setup (IGEMM conv path or SUBCONV2D path).
//
// Returns:
//   xnn_status_success           - operator is ready to run (or skipped if batch_size == 0)
//   xnn_status_uninitialized     - XNNPACK was not initialized
//   xnn_status_invalid_parameter - zero input dimensions, or adjustment >= stride
//   (plus any status propagated from the path-specific setup)
static enum xnn_status setup_deconvolution2d_nhwc(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,   // extra rows added to the output (output_padding adjustment)
  uint32_t adjustment_width,    // extra columns added to the output
  const void* input,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  // Invalidate first: if any check below fails, the operator must not run.
  deconvolution_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(deconvolution_op->type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_height >= deconvolution_op->stride_height) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " height adjustment: "
      "height adjustment must be smaller than height stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_height, deconvolution_op->stride_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_width >= deconvolution_op->stride_width) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " width adjustment: "
      "width adjustment must be smaller than width stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_width, deconvolution_op->stride_width);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    // Nothing to compute; mark as a no-op rather than an error.
    deconvolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  deconvolution_op->batch_size = batch_size;
  deconvolution_op->input_height = input_height;
  deconvolution_op->input_width = input_width;
  deconvolution_op->input = input;
  deconvolution_op->output = output;

  deconvolution_op->output_height = compute_output_dimension(
      input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
      adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
  // Fixed: original had a redundant duplicated self-assignment here
  // (output_width = output_width = ...).
  deconvolution_op->output_width = compute_output_dimension(
      input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
      adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);

  switch (deconvolution_op->ukernel.type) {
    case xnn_ukernel_type_igemm:
      return setup_conv_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        deconvolution_op->output_height, deconvolution_op->output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads);
    case xnn_ukernel_type_subconv2d:
    {
      // The direct-GEMM fast path applies only when the kernel exactly tiles
      // the output: no padding, no adjustment, kernel == stride, and a GEMM
      // ukernel is available.
      const bool no_padding = (deconvolution_op->padding_top | deconvolution_op->padding_right | deconvolution_op->padding_bottom | deconvolution_op->padding_left) == 0;
      const bool no_adjustment = (adjustment_height | adjustment_width) == 0;
      const bool use_gemm = no_padding && no_adjustment &&
        deconvolution_op->kernel_height == deconvolution_op->stride_height &&
        deconvolution_op->kernel_width == deconvolution_op->stride_width &&
        deconvolution_op->ukernel.igemm.gemm_case.function[XNN_UARCH_DEFAULT] != NULL;
      return setup_subconv2d_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        deconvolution_op->output_height, deconvolution_op->output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads, use_gemm);
    }
    default:
      XNN_UNREACHABLE;
  }
}
935 
// Public entry point for setting up a signed-8-bit (QS8) NHWC deconvolution.
// Verifies the operator type, then forwards to the type-generic setup with
// QS8 element-size parameters.
enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const int8_t* input,
    int8_t* output,
    pthreadpool_t threadpool)
{
  // Guard against an operator created for a different data type: its packed
  // weights and ukernels would not match QS8.
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_qs8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  // int8 input/filter/output elements: log2(sizeof(int8_t)) == 0; bias is int32.
  const size_t num_threads = pthreadpool_get_threads_count(threadpool);
  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    /*log2_input_element_size=*/0,
    /*log2_filter_element_size=*/0,
    /*bias_element_size=*/sizeof(int32_t),
    /*log2_output_element_size=*/0,
    &deconvolution_op->params.qs8_conv_minmax,
    sizeof(deconvolution_op->params.qs8_conv_minmax),
    num_threads);
}
966 
// Public entry point for setting up an unsigned-8-bit (QU8) NHWC deconvolution.
// Verifies the operator type, then forwards to the type-generic setup with
// QU8 element-size parameters.
enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  // Guard against an operator created for a different data type: its packed
  // weights and ukernels would not match QU8.
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_qu8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  // uint8 input/filter/output elements: log2(sizeof(uint8_t)) == 0; bias is int32.
  const size_t num_threads = pthreadpool_get_threads_count(threadpool);
  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    /*log2_input_element_size=*/0,
    /*log2_filter_element_size=*/0,
    /*bias_element_size=*/sizeof(int32_t),
    /*log2_output_element_size=*/0,
    &deconvolution_op->params.qu8_conv_minmax,
    sizeof(deconvolution_op->params.qu8_conv_minmax),
    num_threads);
}
997 
// Public entry point for setting up a single-precision (F32) NHWC deconvolution.
// Verifies the operator type, then forwards to the type-generic setup with
// F32 element-size parameters.
enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  // Guard against an operator created for a different data type: its packed
  // weights and ukernels would not match F32.
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_f32) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  // float elements throughout: log2(sizeof(float)) == 2; bias is also float.
  const size_t num_threads = pthreadpool_get_threads_count(threadpool);
  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    /*log2_input_element_size=*/2,
    /*log2_filter_element_size=*/2,
    /*bias_element_size=*/sizeof(float),
    /*log2_output_element_size=*/2,
    &deconvolution_op->params.f32_minmax,
    sizeof(deconvolution_op->params.f32_minmax),
    num_threads);
}
1028