// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params.h>

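// Computes one spatial dimension of the deconvolution output. For transposed
// convolution the output size follows the standard relation
//   output = stride * (input - 1) + adjustment + (kernel - 1) * dilation + 1 - padding
// clamped at zero (doz = difference-or-zero). E.g. input = 5, stride = 2,
// kernel = 3, dilation = 1, adjustment = 0, padding = 0 gives
// 2 * 4 + 0 + 3 = 11 output elements.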
static inline size_t compute_output_dimension(
  size_t input_dimension,
  size_t output_padding_dimension,
  size_t adjustment_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t stride_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}

static enum xnn_status create_deconvolution2d_nhwc(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  const void* kernel,
  const void* bias,
  uint32_t flags,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  xnn_pack_conv_goki_w_function pack_conv_goki_w,
  xnn_pack_deconv_goki_w_function pack_deconv_goki_w,
  const void* packing_params,
  int input_padding_byte,
  int packed_weights_padding_byte,
  const void* params,
  size_t params_size,
  const struct gemm_parameters* gemm_parameters,
  const struct gemm_fused_ukernels* gemm_ukernels,
  enum xnn_operator_type operator_type,
  xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
      xnn_operator_type_to_string(operator_type), groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create %s operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  const uint32_t mr = gemm_parameters->mr;
  const uint32_t nr = gemm_parameters->nr;
  const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
  const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;

  const uint32_t n_stride = round_up(group_output_channels, nr);
  const uint32_t k_stride = round_up_po2(group_input_channels, kr * sr);
  const uint32_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = (((kernel_size * k_stride) << log2_filter_element_size) + bias_element_size) * n_stride;
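  // A strided deconvolution without dilation can be decomposed into
  // stride_height * stride_width independent subconvolutions, one per output
  // phase, which avoids computing kernel taps that land between input pixels.
  // The subconvolution (subconv2d) path is used whenever that decomposition
  // applies and every subkernel is non-empty (stride <= kernel size).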
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      (((kernel_size * k_stride) << log2_filter_element_size) + bias_element_size * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator subconvolution buffer",
        subconvolution_buffer_size, xnn_operator_type_to_string(operator_type));
      goto error;
    }

    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
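    // The subkernel at phase (offset_y, offset_x) owns the kernel taps whose
    // coordinates are congruent to that offset modulo the stride, so its size
    // is ceil((kernel - offset) / stride) in each dimension.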
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = bias_element_size + ((k_stride * subkernel_size) << log2_filter_element_size);
        subconvolution_params++;
      }
    }
  }
  deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator packed weights",
      packed_group_weights_size * groups, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  memset(deconvolution_op->packed_weights, packed_weights_padding_byte, packed_group_weights_size * groups);

  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      pack_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights,
        0 /* extra bytes */,
        packing_params);
      break;
    case xnn_ukernel_type_subconv2d:
      pack_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer,
        packing_params);
      break;
    default:
      XNN_UNREACHABLE;
  }

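  // The zero buffer stands in for out-of-bounds input pixels: indirection
  // buffer entries that would point outside the input are redirected here.
  // For the quantized operators it is filled with the input zero point
  // (input_padding_byte), so implicit padding contributes the correct "zero".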
  const size_t zero_size = (k_stride << log2_input_element_size) + XNN_EXTRA_BYTES;
  deconvolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
  if (deconvolution_op->zero_buffer == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator zero padding",
      zero_size, xnn_operator_type_to_string(operator_type));
    goto error;
  }
  memset(deconvolution_op->zero_buffer, input_padding_byte, zero_size);

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  memcpy(&deconvolution_op->params, params, params_size);
  deconvolution_op->type = operator_type;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .general_case = gemm_ukernels->igemm,
    .gemm_case = gemm_ukernels->gemm,
    .mr = mr,
    .nr = nr,
    .kr = kr,
    .sr = sr,
  };

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}

enum xnn_status xnn_create_deconvolution2d_nhwc_qs8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* deconvolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

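  // The int32 accumulators are requantized to int8 by multiplying with
  // requantization_scale = input_scale * kernel_scale / output_scale.
  // The requantization implementation cannot represent scales of 256.0 or
  // larger, so such combinations are rejected at creation time.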
  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 256.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater or equal to 256.0",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  union xnn_qs8_conv_minmax_params params;
  if XNN_LIKELY(xnn_params.qs8.gemm.init.qs8 != NULL) {
    xnn_params.qs8.gemm.init.qs8(&params,
      requantization_scale, output_zero_point, output_min, output_max);
  }
  const struct xnn_qs8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
  };
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_qs8_deconv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qs8.gemm, &xnn_params.qs8.gemm.minmax,
    xnn_operator_type_deconvolution_nhwc_qs8,
    deconvolution_op_out);
}

enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* deconvolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 256.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater or equal to 256.0",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  union xnn_qu8_conv_minmax_params params;
  if XNN_LIKELY(xnn_params.qu8.gemm.init.qu8 != NULL) {
    xnn_params.qu8.gemm.init.qu8(&params,
      kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
  }
  const struct xnn_qu8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
    .kernel_zero_point = kernel_zero_point,
  };
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_qu8_deconv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
    &params, sizeof(params),
    &xnn_params.qu8.gemm, &xnn_params.qu8.gemm.minmax,
    xnn_operator_type_deconvolution_nhwc_qu8,
    deconvolution_op_out);
}

enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* deconvolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const struct gemm_parameters* gemm_parameters = &xnn_params.f32.gemm;
  if (gemm_parameters->nr > group_output_channels) {
    // Default micro-kernel is suboptimal. Try to find a better micro-kernel.
    if (xnn_params.f32.gemm2.minmax.igemm.function[XNN_UARCH_DEFAULT] != NULL) {
      gemm_parameters = &xnn_params.f32.gemm2;
    }
  }
  const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
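  // When the output range is unbounded ([-inf, +inf]), clamping is a no-op,
  // so micro-kernels with linear (unclamped) activation are preferred when
  // they are available.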
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
  if (linear_activation && gemm_parameters->linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
    gemm_ukernels = &gemm_parameters->linear;
  }

  union xnn_f32_minmax_params params;
  if XNN_LIKELY(xnn_params.f32.gemm.init.f32 != NULL) {
    gemm_parameters->init.f32(&params, output_min, output_max);
  }
  return create_deconvolution2d_nhwc(
    output_padding_top, output_padding_right, output_padding_bottom, output_padding_left,
    kernel_height, kernel_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_pixel_stride, output_pixel_stride,
    kernel, bias, flags,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
    (xnn_pack_deconv_goki_w_function) xnn_pack_f32_deconv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    &params, sizeof(params),
    gemm_parameters, gemm_ukernels,
    xnn_operator_type_deconvolution_nhwc_f32,
    deconvolution_op_out);
}
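
// A minimal usage sketch (not part of this file): creating, setting up, and
// running an F32 deconvolution. The shapes and parameter values below are
// illustrative assumptions, and error handling is elided for brevity.
//
//   xnn_initialize(NULL /* allocator */);
//   xnn_operator_t op = NULL;
//   xnn_create_deconvolution2d_nhwc_f32(
//     /*output_padding=*/0, 0, 0, 0,
//     /*kernel=*/3, 3, /*stride=*/2, 2, /*dilation=*/1, 1,
//     /*groups=*/1, /*group_input_channels=*/16, /*group_output_channels=*/32,
//     /*input_pixel_stride=*/16, /*output_pixel_stride=*/32,
//     kernel_data, bias_data, -INFINITY, INFINITY, /*flags=*/0, &op);
//   xnn_setup_deconvolution2d_nhwc_f32(
//     op, /*batch_size=*/1, /*input_height=*/8, /*input_width=*/8,
//     /*adjustment=*/0, 0, input_data, output_data, /*threadpool=*/NULL);
//   xnn_run_operator(op, /*threadpool=*/NULL);
//   xnn_delete_operator(op);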

static enum xnn_status setup_conv_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

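  // The IGEMM (indirect GEMM) path treats the deconvolution as a dense
  // convolution over an indirection buffer: for every output pixel the buffer
  // holds kernel_size pointers to the contributing input rows (or to the
  // zero buffer where the kernel overhangs the input).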
  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator indirection buffer",
        indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr * deconvolution_op->ukernel.igemm.sr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.igemm = (struct igemm_context) {
    .ks = kernel_size,
    .ks_scaled = kernel_size * mr * sizeof(void*),
    .kc = group_input_channels << log2_input_element_size,
    .w_stride = w_stride,
    .indirect_a = deconvolution_op->indirection_buffer,
    .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
    .zero = deconvolution_op->zero_buffer,
    .packed_w = deconvolution_op->packed_weights,
    .c = deconvolution_op->output,
    .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
    .cn_stride = nr << log2_output_element_size,
    .ga_stride = group_input_channels << log2_input_element_size,
    .gw_stride = w_stride * round_up(group_output_channels, nr),
    .gc_stride = group_output_channels << log2_output_element_size,
    .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
    .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
    .log2_csize = log2_output_element_size,
    .ukernel = deconvolution_op->ukernel.igemm.general_case,
  };
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_case;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, params_size);

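  // Multi-threaded runs additionally split the N (output channel) dimension
  // so that each thread receives roughly target_tiles_per_thread tiles; the
  // per-thread slice is rounded to a multiple of nr so tile boundaries align
  // with the micro-kernel register blocking.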
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  if (groups == 1) {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
      deconvolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
      deconvolution_op->compute.range[0] = output_size;
      deconvolution_op->compute.range[1] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    if (batch_size > 1) {
      deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
      deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
      deconvolution_op->compute.range[0] = batch_size;
      deconvolution_op->compute.range[1] = groups;
      deconvolution_op->compute.range[2] = output_size;
      deconvolution_op->compute.range[3] = group_output_channels;
    } else {
      deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
      deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
      deconvolution_op->compute.range[0] = groups;
      deconvolution_op->compute.range[1] = output_size;
      deconvolution_op->compute.range[2] = group_output_channels;
    }
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_subconv2d_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads,
  bool use_gemm)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t stride_height = deconvolution_op->stride_height;
  const size_t stride_width = deconvolution_op->stride_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;

  const size_t input_pixel_stride = deconvolution_op->input_pixel_stride << log2_input_element_size;
  const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;

  const bool any_size_change =
    input_height != deconvolution_op->last_input_height ||
    input_width != deconvolution_op->last_input_width ||
    output_height != deconvolution_op->last_output_height ||
    output_width != deconvolution_op->last_output_width;

  if (any_size_change || output != deconvolution_op->last_output) {
    // Initialize subconvolution parameters which depend on output dimensions or MR.
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
    const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
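    // Each subconvolution writes a strided slice of the output: the phase
    // (offset_y, offset_x), shifted by the padding modulo the stride, selects
    // the first output row/column the slice covers, and consecutive slice
    // elements are stride_height/stride_width apart.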
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
        const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
        subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
        subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
        subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
        subconvolution_params->output =
          (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
        ++subconvolution_params;
      }
    }
    deconvolution_op->last_output = output;
  }

  if (any_size_change) {
    if (!use_gemm) {
      const size_t indirection_buffer_size = sizeof(void*) *
        kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);

      const void** indirection_buffer =
        (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        xnn_log_error(
          "failed to allocate %zu bytes for %s operator indirection buffer",
          indirection_buffer_size, xnn_operator_type_to_string(deconvolution_op->type));
        return xnn_status_out_of_memory;
      }
      deconvolution_op->indirection_buffer = indirection_buffer;
      deconvolution_op->last_input = input;

      xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);
    }
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;
    deconvolution_op->last_output_height = output_height;
    deconvolution_op->last_output_width = output_width;
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const uint32_t kr = deconvolution_op->ukernel.igemm.kr;
  const uint32_t sr = deconvolution_op->ukernel.igemm.sr;
  const size_t w_stride = stride_height * stride_width * bias_element_size +
    (round_up_po2(group_input_channels, kr * sr) * kernel_size << log2_filter_element_size);
  if (use_gemm) {
    deconvolution_op->context.subgemm = (struct subgemm_context) {
      .subconvolution_params = deconvolution_op->subconvolution_buffer,
      .kc = group_input_channels << log2_input_element_size,
      .a = input,
      .ax_stride = input_pixel_stride,
      .ay_stride = input_width * input_pixel_stride,
      .cx_stride = stride_width * output_pixel_stride,
      .cy_stride = stride_height * output_width * output_pixel_stride,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * input_pixel_stride,
      .bc_stride = output_size * output_pixel_stride,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.gemm_case,
    };
    memcpy(&deconvolution_op->context.subgemm.params, params, params_size);
  } else {
    deconvolution_op->context.subconv = (struct subconv_context) {
      .subconvolution_params = deconvolution_op->subconvolution_buffer,
      .kc = group_input_channels << log2_input_element_size,
      .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
      .zero = deconvolution_op->zero_buffer,
      .cx_stride = stride_width * output_pixel_stride,
      .cy_stride = stride_height * output_width * output_pixel_stride,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * input_pixel_stride,
      .bc_stride = output_size * output_pixel_stride,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.general_case,
    };
    memcpy(&deconvolution_op->context.subconv.params, params, params_size);
  }

  const size_t output_height_positions = divide_round_up(output_height, stride_height);
  const size_t output_width_positions = divide_round_up(output_width, stride_width);

  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * stride_height * stride_width *
      output_height_positions * divide_round_up(output_width_positions, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }

  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
    deconvolution_op->compute.task_5d_tile_2d = use_gemm ?
      (pthreadpool_task_5d_tile_2d_t) xnn_compute_subgemm2d : (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = stride_height * stride_width;
    deconvolution_op->compute.range[2] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[3] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[4] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
    deconvolution_op->compute.task_6d_tile_2d = use_gemm ?
      (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subgemm2d : (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = stride_height * stride_width;
    deconvolution_op->compute.range[3] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[4] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[5] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }

  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_deconvolution2d_nhwc(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const void* input,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  deconvolution_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(deconvolution_op->type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_height >= deconvolution_op->stride_height) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " height adjustment: "
      "height adjustment must be smaller than height stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_height, deconvolution_op->stride_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_width >= deconvolution_op->stride_width) {
    xnn_log_error(
      "failed to setup %s operator with %" PRIu32 " width adjustment: "
      "width adjustment must be smaller than width stride (%" PRIu32 ")",
      xnn_operator_type_to_string(deconvolution_op->type), adjustment_width, deconvolution_op->stride_width);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    deconvolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  deconvolution_op->batch_size = batch_size;
  deconvolution_op->input_height = input_height;
  deconvolution_op->input_width = input_width;
  deconvolution_op->input = input;
  deconvolution_op->output = output;

  deconvolution_op->output_height = compute_output_dimension(
    input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
    adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
  deconvolution_op->output_width = compute_output_dimension(
    input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
    adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);

  switch (deconvolution_op->ukernel.type) {
    case xnn_ukernel_type_igemm:
      return setup_conv_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        deconvolution_op->output_height, deconvolution_op->output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads);
    case xnn_ukernel_type_subconv2d:
    {
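      // When there is no padding or adjustment and the kernel exactly tiles
      // the output (kernel == stride), every input pixel maps to a disjoint
      // kernel_height x kernel_width output block, so the indirection buffer
      // can be skipped and each subconvolution runs as a plain GEMM.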
      const bool no_padding = (deconvolution_op->padding_top | deconvolution_op->padding_right | deconvolution_op->padding_bottom | deconvolution_op->padding_left) == 0;
      const bool no_adjustment = (adjustment_height | adjustment_width) == 0;
      const bool use_gemm = no_padding && no_adjustment &&
        deconvolution_op->kernel_height == deconvolution_op->stride_height &&
        deconvolution_op->kernel_width == deconvolution_op->stride_width &&
        deconvolution_op->ukernel.igemm.gemm_case.function[XNN_UARCH_DEFAULT] != NULL;
      return setup_subconv2d_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        deconvolution_op->output_height, deconvolution_op->output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, params_size, num_threads, use_gemm);
    }
    default:
      XNN_UNREACHABLE;
  }
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_qs8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qs8),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(int8_t)) */,
    &deconvolution_op->params.qs8_conv_minmax, sizeof(deconvolution_op->params.qs8_conv_minmax),
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_qu8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_qu8),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &deconvolution_op->params.qu8_conv_minmax, sizeof(deconvolution_op->params.qu8_conv_minmax),
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_f32) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_deconvolution_nhwc_f32),
      xnn_operator_type_to_string(deconvolution_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d_nhwc(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &deconvolution_op->params.f32_minmax, sizeof(deconvolution_op->params.f32_minmax),
    pthreadpool_get_threads_count(threadpool));
}