• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2022 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/normalization.h>
#include <xnnpack/operator.h>
19 
20 /// Reorder the data in array using the indices in loop_order.
21 ///
22 /// Changing the loop order can have dramatic performance implications.
reorder_array(size_t num_dims,const size_t loop_order[restrict XNN_MIN_ELEMENTS (1)],size_t array[restrict XNN_MIN_ELEMENTS (1)])23 static void reorder_array(
24     size_t num_dims,
25     const size_t loop_order[restrict XNN_MIN_ELEMENTS(1) ],
26     size_t array[restrict XNN_MIN_ELEMENTS(1)])
27 {
28   size_t tmp[XNN_MAX_TENSOR_DIMS];
29   memcpy(tmp, array, sizeof(size_t) * num_dims);
30   for (size_t i = 0; i < num_dims; ++i) {
31     array[i] = tmp[loop_order[i]];
32   }
33 }
34 
/// Initialize an already-allocated transpose operator.
///
/// Fails with xnn_status_unsupported_hardware when the kernels for the
/// requested datatype were not initialized (checked via xnn_params.init_flags).
static enum xnn_status init_transpose_nd(
    uint32_t flags,
    uint32_t datatype_init_flags,
    enum xnn_operator_type operator_type,
    xnn_operator_t transpose_op)
{
  if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
    xnn_log_error(
      "failed to create %s operator: operations on data type are not supported",
      xnn_operator_type_to_string(operator_type));
    return xnn_status_unsupported_hardware;
  }

  transpose_op->flags = flags;
  transpose_op->type = operator_type;
  return xnn_status_success;
}
57 
/// Allocate and initialize a transpose operator.
///
/// On success, ownership of the new operator is transferred to the caller via
/// *transpose_op_out; on failure, nothing is written to the out-parameter.
static enum xnn_status create_transpose_nd(
    uint32_t flags,
    uint32_t datatype_init_flags,
    enum xnn_operator_type operator_type,
    xnn_operator_t* transpose_op_out)
{
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    return xnn_status_uninitialized;
  }

  xnn_operator_t transpose_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (transpose_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    xnn_delete_operator(transpose_op);
    return xnn_status_out_of_memory;
  }

  const enum xnn_status status = init_transpose_nd(flags, datatype_init_flags, operator_type, transpose_op);
  if (status != xnn_status_success) {
    xnn_delete_operator(transpose_op);
    return status;
  }

  *transpose_op_out = transpose_op;
  return xnn_status_success;
}
93 
94 /// input_stride and output_stride are the number of elements between each
95 /// dimension, not the size of the dimension. This is because depth to space
96 /// splits the input channel dimension into three dimensions - block_size *
97 /// block_size * output_channels but gives input_channel_stride the stride over
98 /// all three dimensions. This must be multiplied by the product of the previous
99 /// dimensions to get the stride in elements. input_channel_stride is not
100 /// requried to be a multiple of block_size * block_size * output_channels so
101 /// the stride in number of elements must be supplied.
102 /// An interface for sub-tensors can easily be built on top of this.
setup_transpose_nd(xnn_operator_t transpose_op,const void * input,void * output,const size_t num_dims,const size_t * input_shape,const size_t * perm,const size_t * input_stride,const size_t * output_stride,size_t element_size)103 static enum xnn_status setup_transpose_nd(
104   xnn_operator_t transpose_op,
105   const void* input,
106   void* output,
107   const size_t num_dims,
108   const size_t* input_shape,
109   const size_t* perm,
110   const size_t* input_stride,
111   const size_t* output_stride,
112   size_t element_size)
113 {
114   enum xnn_status status = xnn_status_invalid_parameter;
115   transpose_op->state = xnn_run_state_invalid;
116 
117   if (num_dims == 0) {
118     xnn_log_error(
119       "failed to create %s operator with %zu num_dims: num_dims must be non-zero",
120       xnn_operator_type_to_string(transpose_op->type), num_dims);
121     goto error;
122   }
123 
124   if (num_dims > XNN_MAX_TENSOR_DIMS) {
125     xnn_log_error(
126       "failed to create %s operator with %zu num_dims: num_dims must be <= %d",
127       xnn_operator_type_to_string(transpose_op->type), num_dims, XNN_MAX_TENSOR_DIMS);
128     goto error;
129   }
130 
131   for (size_t i = 0; i < num_dims; ++i) {
132     if (perm[i] >= num_dims) {
133       xnn_log_error(
134           "failed to create %s operator with %zu perm and %zu num_dims: 0 <= perm < num_dims",
135           xnn_operator_type_to_string(transpose_op->type), perm[i], num_dims);
136       goto error;
137     }
138   }
139 
140   for (size_t i = 0; i < num_dims - 1; ++i) {
141     for (size_t j = i + 1; j < num_dims; ++j) {
142       if (perm[i] == perm[j]) {
143         xnn_log_error(
144             "failed to create %s operator with duplicate entries in perm",
145             xnn_operator_type_to_string(transpose_op->type));
146         goto error;
147       }
148     }
149   }
150 
151   if (input_stride != NULL) {
152     if (input_stride[num_dims - 1] != 1) {
153       xnn_log_error(
154           "failed to create %s operator with %zu input_stride[num_dims - 1]: input_stride[num_dims - 1] == 1",
155           xnn_operator_type_to_string(transpose_op->type), input_stride[num_dims - 1]);
156     }
157     size_t current_stride = 1;
158     for (size_t i = num_dims - 1; i > 0; --i) {
159       if ((input_stride[i - 1] < input_stride[i] * input_shape[i]) || (input_stride[i - 1] < current_stride)) {
160         xnn_log_error(
161             "failed to create %s operator with %zu input_shape and %zu input_stride: input_stride >= input_shape",
162             xnn_operator_type_to_string(transpose_op->type), input_shape[i], input_stride[i]);
163       }
164       current_stride *= input_shape[i];
165     }
166   }
167 
168   if (output_stride != NULL) {
169     if (output_stride[num_dims - 1] != 1) {
170       xnn_log_error(
171           "failed to create %s operator with %zu output_stride[num_dims - 1]: output_stride[num_dims - 1] == 1",
172           xnn_operator_type_to_string(transpose_op->type), output_stride[num_dims - 1]);
173     }
174     size_t current_stride = 1;
175     for (size_t i = num_dims - 1; i > 0; --i) {
176       if ((output_stride[i - 1] < output_stride[i] * input_shape[perm[i]]) || (output_stride[i - 1] < current_stride)) {
177         xnn_log_error(
178             "failed to create %s operator with %zu output_shape and %zu output_stride: output_stride >= output_shape",
179             xnn_operator_type_to_string(transpose_op->type), input_shape[perm[i]], output_stride[i]);
180       }
181       current_stride *= input_shape[perm[i]];
182     }
183   }
184 
185   transpose_op->channels = num_dims;
186 
187   struct transpose_context* context = &transpose_op->context.transpose;
188   size_t normalized_dims;
189   size_t normalized_shape[XNN_MAX_TENSOR_DIMS];
190   size_t normalized_perm[XNN_MAX_TENSOR_DIMS];
191   size_t normalized_element_size;
192   xnn_normalize_transpose_permutation(num_dims, element_size, perm, input_shape, input_stride, output_stride, &normalized_dims,
193                                       &normalized_element_size, normalized_perm, normalized_shape, context->input_stride, context->output_stride);
194 
195   size_t loop_order[XNN_MAX_TENSOR_DIMS];
196   memcpy(loop_order, normalized_perm, sizeof(size_t) * normalized_dims);
197 
198   /// The innermost loop must iterate over the contiguous input dimension and the second most inner loop over the
199   /// contiguous output dimension.
200   if (normalized_dims > 1) {
201     for (size_t i = 0; i < normalized_dims - 2; ++i) {
202       if (loop_order[i] == normalized_dims - 1) {
203         size_t tmp = loop_order[i];
204         loop_order[i] = loop_order[normalized_dims - 2];
205         loop_order[normalized_dims - 2] = tmp;
206         tmp = context->output_stride[i];
207         context->output_stride[i] = context->output_stride[normalized_dims - 2];
208         context->output_stride[normalized_dims - 2] = tmp;
209         break;
210       }
211     }
212   }
213 
214   for (size_t i = 0; i < normalized_dims; ++i) {
215     transpose_op->compute.range[i] = normalized_shape[i];
216   }
217   reorder_array(normalized_dims, loop_order, context->input_stride);
218   reorder_array(normalized_dims, loop_order, transpose_op->compute.range);
219 
220   bool variable_size_ukernel = false;
221   switch (normalized_element_size) {
222     case 1:
223       context->log2_element_size = 0;
224       context->const_size_ukernel = xnn_params.x8.transpose.const_size_ukernel;
225       transpose_op->compute.tile[0] = xnn_params.x8.transpose.tile_size;
226       transpose_op->compute.tile[1] = xnn_params.x8.transpose.tile_size;
227       break;
228     case 2:
229       context->log2_element_size = 1;
230       transpose_op->compute.tile[0] = xnn_params.x16.transpose.tile_size;
231       transpose_op->compute.tile[1] = xnn_params.x16.transpose.tile_size;
232       context->const_size_ukernel = xnn_params.x16.transpose.const_size_ukernel;
233       break;
234     case 4:
235       context->log2_element_size = 2;
236       transpose_op->compute.tile[0] = xnn_params.x32.transpose.tile_size;
237       transpose_op->compute.tile[1] = xnn_params.x32.transpose.tile_size;
238       context->const_size_ukernel = xnn_params.x32.transpose.const_size_ukernel;
239       break;
240     default:
241       context->element_size = normalized_element_size;
242       transpose_op->compute.tile[0] = xnn_params.xx.transpose.tile_size;
243       transpose_op->compute.tile[1] = xnn_params.xx.transpose.tile_size;
244       context->variable_size_ukernel = xnn_params.xx.transpose.variable_size_ukernel;
245       variable_size_ukernel = true;
246   }
247 
248   struct univector_contiguous_context* univector_context = &transpose_op->context.univector_contiguous;
249   switch (normalized_dims) {
250     case 1:
251       transpose_op->compute.type = xnn_parallelization_type_1d_tile_1d;
252       transpose_op->compute.task_1d = (pthreadpool_task_1d_t) xnn_compute_univector_contiguous;
253       transpose_op->compute.range[0] = normalized_element_size;
254       univector_context->ukernel = xnn_params.xx.copy;
255       univector_context->log2_xsize = 0;
256       univector_context->log2_ysize = 0;
257       break;
258     case 2:
259       transpose_op->compute.type = xnn_parallelization_type_2d_tile_2d;
260       if (variable_size_ukernel) {
261         transpose_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_transposev_2d;
262       } else {
263         transpose_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_transposec_2d;
264       }
265       break;
266     case 3:
267       transpose_op->compute.type = xnn_parallelization_type_3d_tile_2d;
268       if (variable_size_ukernel) {
269         transpose_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_transposev_3d;
270       } else {
271         transpose_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_transposec_3d;
272       }
273       break;
274     case 4:
275       transpose_op->compute.type = xnn_parallelization_type_4d_tile_2d;
276       if (variable_size_ukernel) {
277         transpose_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_transposev_4d;
278       } else {
279         transpose_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_transposec_4d;
280       }
281       break;
282     case 5:
283       transpose_op->compute.type = xnn_parallelization_type_5d_tile_2d;
284       if (variable_size_ukernel) {
285         transpose_op->compute.task_5d_tile_2d = (pthreadpool_task_5d_tile_2d_t) xnn_compute_transposev_5d;
286       } else {
287         transpose_op->compute.task_5d_tile_2d = (pthreadpool_task_5d_tile_2d_t) xnn_compute_transposec_5d;
288       }
289       break;
290     case 6:
291       transpose_op->compute.type = xnn_parallelization_type_6d_tile_2d;
292       if (variable_size_ukernel) {
293         transpose_op->compute.task_6d_tile_2d = (pthreadpool_task_6d_tile_2d_t) xnn_compute_transposev_6d;
294       } else {
295         transpose_op->compute.task_6d_tile_2d = (pthreadpool_task_6d_tile_2d_t) xnn_compute_transposec_6d;
296       }
297       break;
298     default:
299       XNN_UNREACHABLE;
300   }
301 
302   if (transpose_op->channels == 1) {
303     transpose_op->context.univector_contiguous.x = input;
304     transpose_op->context.univector_contiguous.y = output;
305   } else {
306     transpose_op->context.transpose.x = input;
307     transpose_op->context.transpose.y = output;
308   }
309   transpose_op->state = xnn_run_state_ready;
310 
311   return xnn_status_success;
312 
313 error:
314   xnn_delete_operator(transpose_op);
315   return status;
316 }
317 
enum xnn_status xnn_create_transpose_nd_x32(
  uint32_t flags,
  xnn_operator_t* transpose_op_out)
{
  // x32 variant: delegate to the shared creation path with 32-bit init flags.
  return create_transpose_nd(flags, XNN_INIT_FLAG_X32,
                             xnn_operator_type_transpose_nd_x32, transpose_op_out);
}
328 
enum xnn_status xnn_create_transpose_nd_x16(
  uint32_t flags,
  xnn_operator_t* transpose_op_out)
{
  // x16 variant: delegate to the shared creation path with 16-bit init flags.
  return create_transpose_nd(flags, XNN_INIT_FLAG_X16,
                             xnn_operator_type_transpose_nd_x16, transpose_op_out);
}
339 
enum xnn_status xnn_create_transpose_nd_x8(
  uint32_t flags,
  xnn_operator_t* transpose_op_out)
{
  // x8 variant: delegate to the shared creation path with 8-bit init flags.
  return create_transpose_nd(flags, XNN_INIT_FLAG_X8,
                             xnn_operator_type_transpose_nd_x8, transpose_op_out);
}
350 
enum xnn_status xnn_setup_transpose_nd_x32(
    xnn_operator_t transpose_op,
    const void* input,
    void* output,
    size_t num_dims,
    const size_t* shape,
    const size_t* perm,
    pthreadpool_t threadpool)
{
  // Reject operators that were not created as x32 transpose.
  if (transpose_op->type != xnn_operator_type_transpose_nd_x32) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_transpose_nd_x32),
      xnn_operator_type_to_string(transpose_op->type));
    return xnn_status_invalid_parameter;
  }

  // Dense (contiguous) tensors: no explicit strides, 4-byte elements.
  return setup_transpose_nd(transpose_op, input, output, num_dims, shape, perm,
                            /*input_stride=*/NULL, /*output_stride=*/NULL,
                            sizeof(uint32_t));
}
373 
enum xnn_status xnn_setup_transpose_nd_x16(
    xnn_operator_t transpose_op,
    const void* input,
    void* output,
    size_t num_dims,
    const size_t* shape,
    const size_t* perm,
    pthreadpool_t threadpool)
{
  // Reject operators that were not created as x16 transpose.
  if (transpose_op->type != xnn_operator_type_transpose_nd_x16) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_transpose_nd_x16),
      xnn_operator_type_to_string(transpose_op->type));
    return xnn_status_invalid_parameter;
  }

  // Dense (contiguous) tensors: no explicit strides, 2-byte elements.
  return setup_transpose_nd(transpose_op, input, output, num_dims, shape, perm,
                            /*input_stride=*/NULL, /*output_stride=*/NULL,
                            sizeof(uint16_t));
}
396 
enum xnn_status xnn_setup_transpose_nd_x8(
    xnn_operator_t transpose_op,
    const void* input,
    void* output,
    size_t num_dims,
    const size_t* shape,
    const size_t* perm,
    pthreadpool_t threadpool)
{
  // Reject operators that were not created as x8 transpose.
  if (transpose_op->type != xnn_operator_type_transpose_nd_x8) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_transpose_nd_x8),
      xnn_operator_type_to_string(transpose_op->type));
    return xnn_status_invalid_parameter;
  }

  // Dense (contiguous) tensors: no explicit strides, 1-byte elements.
  return setup_transpose_nd(transpose_op, input, output, num_dims, shape, perm,
                            /*input_stride=*/NULL, /*output_stride=*/NULL,
                            sizeof(uint8_t));
}
419 
/// One-shot transpose: initialize, set up, and run a transpose in a single
/// call, using a temporary operator on the stack instead of a heap-allocated
/// persistent one.
enum xnn_status run_transpose_nd(
    uint32_t flags,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    size_t element_size,
    uint32_t datatype_init_flags,
    enum xnn_operator_type operator_type,
    pthreadpool_t threadpool) {
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    return xnn_status_uninitialized;
  }

  // Zero the whole descriptor; the transpose path only fills in what it needs.
  struct xnn_operator transpose_op;
  memset(&transpose_op, 0, sizeof(transpose_op));

  enum xnn_status status =
      init_transpose_nd(flags, datatype_init_flags, operator_type, &transpose_op);
  if (status == xnn_status_success) {
    // Dense tensors: strides are derived from the shape (NULL strides).
    status = setup_transpose_nd(&transpose_op, input, output, num_dims,
                                input_shape, output_perm, NULL, NULL, element_size);
  }
  if (status != xnn_status_success) {
    return status;
  }

  return xnn_run_operator(&transpose_op, threadpool);
}
466 
enum xnn_status xnn_run_transpose_nd_x32(
    uint32_t flags,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool) {
  // Single-shot x32 transpose: 4-byte elements.
  return run_transpose_nd(flags, input, output, num_dims, input_shape, output_perm,
                          sizeof(uint32_t), XNN_INIT_FLAG_X32,
                          xnn_operator_type_transpose_nd_x32, threadpool);
}
488 
enum xnn_status xnn_run_transpose_nd_x16(
    uint32_t flags,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool) {
  // Single-shot x16 transpose: 2-byte elements.
  return run_transpose_nd(flags, input, output, num_dims, input_shape, output_perm,
                          sizeof(uint16_t), XNN_INIT_FLAG_X16,
                          xnn_operator_type_transpose_nd_x16, threadpool);
}
510 
enum xnn_status xnn_run_transpose_nd_x8(
    uint32_t flags,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool) {
  // Single-shot x8 transpose: 1-byte elements.
  return run_transpose_nd(flags, input, output, num_dims, input_shape, output_perm,
                          sizeof(uint8_t), XNN_INIT_FLAG_X8,
                          xnn_operator_type_transpose_nd_x8, threadpool);
}
532 
// Shared depth-to-space creation logic, defined later in this file.
static enum xnn_status create_depth_to_space_nhwc(
    size_t output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    enum xnn_operator_type operator_type,
    xnn_operator_t* depth_to_space_op_out);

/// Create an NCHW -> NHWC depth-to-space operator for 32-bit elements.
///
/// This was a line-for-line duplicate of create_depth_to_space_nhwc with the
/// operator type hardcoded; delegate to the shared helper instead. Validation,
/// log messages, field initialization, and error codes are identical.
enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32(
    size_t output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    xnn_operator_t* depth_to_space_op_out)
{
  return create_depth_to_space_nhwc(
    output_channels,
    input_channel_stride,
    output_channel_stride,
    block_size,
    flags,
    xnn_operator_type_depth_to_space_nchw2nhwc_x32,
    depth_to_space_op_out);
}
611 
/// Set up an NCHW -> NHWC depth-to-space operation by expressing it as a
/// strided 6-D transpose (see setup_transpose_nd).
enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32(
    xnn_operator_t depth_to_space_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  if (depth_to_space_op->type != xnn_operator_type_depth_to_space_nchw2nhwc_x32) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(xnn_operator_type_depth_to_space_nchw2nhwc_x32),
      xnn_operator_type_to_string(depth_to_space_op->type));
    return xnn_status_invalid_parameter;
  }
  depth_to_space_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(xnn_operator_type_depth_to_space_nchw2nhwc_x32));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error("failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(xnn_operator_type_depth_to_space_nchw2nhwc_x32), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  // An empty batch is a no-op, not an error.
  if (batch_size == 0) {
    depth_to_space_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  const uint32_t block_size = depth_to_space_op->block_size;
  const size_t channels = depth_to_space_op->channels;

  // View the NCHW input channel dimension as block_row x block_col x channels:
  //   [batch, block_row, block_col, channels, height, width]
  const size_t input_shape[6] = {batch_size, block_size, block_size, channels, input_height, input_width};
  // Output dimension k takes input dimension perm[k], producing
  //   [batch, height, block_row, width, block_col, channels]
  // i.e. NHWC output with each spatial dimension expanded by block_size.
  const size_t perm[6] = {0, 4, 1, 5, 2, 3};
  const size_t area = input_height * input_width;
  const size_t elements_per_batch = area * channels;
  // Input strides in elements. The batch stride honors input_pixel_stride
  // (which need not equal block_size * block_size * channels — see the
  // setup_transpose_nd comment); the inner dimensions are assumed dense.
  const size_t input_stride[6] = {
    depth_to_space_op->input_pixel_stride * area,
    block_size * elements_per_batch,
    elements_per_batch,
    area,
    input_width,
    1};
  // Output strides in elements for the NHWC result; the innermost (channel)
  // step is output_pixel_stride, so output channels may be padded.
  const size_t output_stride[6] = {
    input_height * block_size * input_width * block_size * depth_to_space_op->output_pixel_stride,
    block_size * input_width * block_size * depth_to_space_op->output_pixel_stride,
    input_width * block_size * depth_to_space_op->output_pixel_stride,
    block_size * depth_to_space_op->output_pixel_stride,
    depth_to_space_op->output_pixel_stride,
    1};

  return setup_transpose_nd(
    depth_to_space_op,
    input,
    output,
    6,
    input_shape,
    perm,
    input_stride,
    output_stride,
    sizeof(uint32_t));
}
679 
/// Validate parameters, allocate, and initialize an NHWC depth-to-space
/// operator of the given operator_type. Ownership of the new operator is
/// transferred to the caller through *depth_to_space_op_out on success.
static enum xnn_status create_depth_to_space_nhwc(
    size_t output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    enum xnn_operator_type operator_type,
    xnn_operator_t* depth_to_space_op_out)
{
  // `op` is NULL until allocation succeeds; xnn_delete_operator is still
  // invoked on every failure path, matching the previous goto-based cleanup.
  xnn_operator_t op = NULL;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    xnn_delete_operator(op);
    return xnn_status_uninitialized;
  }

  if (output_channels == 0) {
    xnn_log_error("failed to create %s operator with %zu output channels: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), output_channels);
    xnn_delete_operator(op);
    return xnn_status_invalid_parameter;
  }

  if (output_channel_stride < output_channels) {
    xnn_log_error(
      "failed to create %s operator with output channel stride of %zu: "
      "stride must be at least as large as the number of output channels (%zu)",
      xnn_operator_type_to_string(operator_type),
      output_channel_stride, output_channels);
    xnn_delete_operator(op);
    return xnn_status_invalid_parameter;
  }

  if (block_size <= 1) {
    xnn_log_error("failed to create %s operator with %u block size: block size must be greater than 1",
      xnn_operator_type_to_string(operator_type),
      block_size);
    xnn_delete_operator(op);
    return xnn_status_invalid_parameter;
  }

  // Depth-to-space folds block_size x block_size x output_channels input
  // channels into each output pixel.
  const size_t input_channels = output_channels * block_size * block_size;
  if (input_channel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input channel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      input_channel_stride, block_size, block_size, input_channels);
    xnn_delete_operator(op);
    return xnn_status_invalid_parameter;
  }

  op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    xnn_delete_operator(op);
    return xnn_status_out_of_memory;
  }

  op->channels = output_channels;
  op->input_pixel_stride = input_channel_stride;
  op->output_pixel_stride = output_channel_stride;
  op->block_size = block_size;
  op->type = operator_type;
  op->flags = flags;
  // Not runnable until a setup call provides shapes and buffers.
  op->state = xnn_run_state_invalid;

  *depth_to_space_op_out = op;
  return xnn_status_success;
}
759 
enum xnn_status xnn_create_depth_to_space_nhwc_x8(
    size_t output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    xnn_operator_t* depth_to_space_op_out)
{
  // 8-bit variant of the shared NHWC depth-to-space creation path.
  return create_depth_to_space_nhwc(output_channels, input_channel_stride,
                                    output_channel_stride, block_size, flags,
                                    xnn_operator_type_depth_to_space_nhwc_x8,
                                    depth_to_space_op_out);
}
777 
enum xnn_status xnn_create_depth_to_space_nhwc_x16(
    size_t output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    xnn_operator_t* depth_to_space_op_out)
{
  // 16-bit variant of the shared NHWC depth-to-space creation path.
  return create_depth_to_space_nhwc(output_channels, input_channel_stride,
                                    output_channel_stride, block_size, flags,
                                    xnn_operator_type_depth_to_space_nhwc_x16,
                                    depth_to_space_op_out);
}
795 
// Creates a 32-bit-element Depth-to-Space (NHWC) operator.
// Thin public wrapper: tags the operator with the X32 type and defers all
// validation and allocation to the shared creation routine.
enum xnn_status xnn_create_depth_to_space_nhwc_x32(
    size_t output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    xnn_operator_t* depth_to_space_op_out)
{
  const enum xnn_operator_type operator_type = xnn_operator_type_depth_to_space_nhwc_x32;
  return create_depth_to_space_nhwc(
    output_channels, input_channel_stride, output_channel_stride,
    block_size, flags, operator_type, depth_to_space_op_out);
}
813 
/// Configures a Depth-to-Space (NHWC) operator for a concrete input size by
/// lowering it to a 5D transpose.
///
/// The [N, H, W, B*B*C] input is viewed as a 5D tensor
/// [N*H, W, B, B, C] (B = block size, C = output channels) and permuted with
/// perm = {0, 2, 1, 3, 4} to [N*H, B, W, B, C], which, laid out with the
/// output strides below, is exactly the [N, H*B, W*B, C] output.
///
/// Returns xnn_status_invalid_parameter on type mismatch or zero spatial
/// dimensions, xnn_status_uninitialized if XNNPACK is not initialized, and
/// otherwise forwards the status of setup_transpose_nd. A zero batch size
/// is not an error: the operator is marked as a no-op (skip state).
static enum xnn_status setup_depth_to_space_nhwc(
    xnn_operator_t depth_to_space_op,
    enum xnn_operator_type expected_operator_type,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    uint32_t element_size)
{
  if (depth_to_space_op->type != expected_operator_type) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(expected_operator_type),
      xnn_operator_type_to_string(depth_to_space_op->type));
    return xnn_status_invalid_parameter;
  }
  // Invalidate first so a failed setup cannot leave a stale runnable state.
  depth_to_space_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(expected_operator_type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error("failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(expected_operator_type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    // Empty batch: nothing to compute, mark the operator to be skipped.
    depth_to_space_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  const uint32_t block_size = depth_to_space_op->block_size;
  const size_t channels = depth_to_space_op->channels;
  const size_t input_pixel_stride = depth_to_space_op->input_pixel_stride;
  const size_t output_pixel_stride = depth_to_space_op->output_pixel_stride;
  // Stride between consecutive output rows within one block row: B output
  // pixels per input pixel along the width axis.
  const size_t block_output_pixel_stride = block_size * depth_to_space_op->output_pixel_stride;

  // 5D view of the input: [N*H, W, B(height), B(width), C].
  const size_t input_shape[5] = {batch_size * input_height, input_width, block_size, block_size, channels};
  // Swap the W and block-height axes: [N*H, B, W, B, C].
  const size_t perm[5] = {0, 2, 1, 3, 4};
  // Element strides of the 5D input view; the pixel stride may exceed
  // B*B*C to accommodate padded channel layouts.
  const size_t input_stride[5] = {
    input_width * input_pixel_stride,
    input_pixel_stride,
    block_size * channels,
    channels,
    1};
  // Element strides of the permuted output, i.e. the [N, H*B, W*B, C] layout.
  const size_t output_stride[5] = {
    block_size * input_width * block_output_pixel_stride,
    input_width * block_output_pixel_stride,
    block_output_pixel_stride,
    output_pixel_stride,
    1};

  return setup_transpose_nd(
      depth_to_space_op,
      input,
      output,
      5,
      input_shape,
      perm,
      input_stride,
      output_stride,
      element_size);
}
881 
// Sets up an 8-bit-element Depth-to-Space (NHWC) operator for the given
// batch and spatial dimensions. The threadpool parameter is accepted for API
// uniformity; parallelization is decided at run time.
enum xnn_status xnn_setup_depth_to_space_nhwc_x8(
    xnn_operator_t depth_to_space_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  return setup_depth_to_space_nhwc(
    depth_to_space_op,
    xnn_operator_type_depth_to_space_nhwc_x8,
    batch_size, input_height, input_width,
    // sizeof(uint8_t) instead of magic 1, matching the space-to-depth wrappers.
    input, output, sizeof(uint8_t));
}
897 
// Sets up a 16-bit-element Depth-to-Space (NHWC) operator for the given
// batch and spatial dimensions. The threadpool parameter is accepted for API
// uniformity; parallelization is decided at run time.
enum xnn_status xnn_setup_depth_to_space_nhwc_x16(
    xnn_operator_t depth_to_space_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  return setup_depth_to_space_nhwc(
    depth_to_space_op,
    xnn_operator_type_depth_to_space_nhwc_x16,
    batch_size, input_height, input_width,
    // sizeof(uint16_t) instead of magic 2, matching the space-to-depth wrappers.
    input, output, sizeof(uint16_t));
}
913 
// Sets up a 32-bit-element Depth-to-Space (NHWC) operator for the given
// batch and spatial dimensions. The threadpool parameter is accepted for API
// uniformity; parallelization is decided at run time.
enum xnn_status xnn_setup_depth_to_space_nhwc_x32(
    xnn_operator_t depth_to_space_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  return setup_depth_to_space_nhwc(
    depth_to_space_op,
    xnn_operator_type_depth_to_space_nhwc_x32,
    batch_size, input_height, input_width,
    // sizeof(uint32_t) instead of magic 4, matching the space-to-depth wrappers.
    input, output, sizeof(uint32_t));
}
929 
/// Shared creation routine for the Space-to-Depth (NHWC) operators.
///
/// Validates the parameters, allocates a zero-initialized operator
/// descriptor, and stores the layout parameters for later setup. On any
/// failure the partially-constructed operator is released via
/// xnn_delete_operator (safe on NULL) before returning the error status.
///
/// @param input_channels       number of channels in the input; must be non-zero.
/// @param input_channel_stride element stride between input pixels; must be
///                             >= input_channels.
/// @param output_channel_stride element stride between output pixels; must be
///                             >= input_channels * block_size^2.
/// @param block_size           spatial block edge; must be > 1.
/// @param operator_type        concrete x8/x16/x32 type tag to record.
/// @param space_to_depth_op_out receives the new operator on success.
static enum xnn_status create_space_to_depth_nhwc(
    size_t input_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    enum xnn_operator_type operator_type,
    xnn_operator_t* space_to_depth_op_out)
{
  xnn_operator_t space_to_depth_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (input_channels == 0) {
    xnn_log_error("failed to create %s operator with %zu input channels: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), input_channels);
    goto error;
  }

  if (input_channel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input channel stride of %zu: "
      "stride must be at least as large as the number of input channels (%zu)",
      xnn_operator_type_to_string(operator_type),
      input_channel_stride, input_channels);
    goto error;
  }

  if (block_size <= 1) {
    xnn_log_error("failed to create %s operator with %u block size: block size must be greater than 1",
      xnn_operator_type_to_string(operator_type),
      block_size);
    goto error;
  }

  // Space-to-depth folds a block_size x block_size spatial tile into channels.
  const size_t output_channels = input_channels * block_size * block_size;
  if (output_channel_stride < output_channels) {
    // NOTE(review): PRIu32 requires <inttypes.h>; the visible include list
    // shows only <stdint.h> -- confirm it is pulled in transitively.
    xnn_log_error(
      "failed to create %s operator with output channel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      output_channel_stride, block_size, block_size, input_channels);
    goto error;
  }

  status = xnn_status_out_of_memory;

  space_to_depth_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (space_to_depth_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  // Note: channels stores the INPUT channel count here (the setup routine
  // derives the output layout from it together with block_size).
  space_to_depth_op->channels = input_channels;
  space_to_depth_op->input_pixel_stride = input_channel_stride;
  space_to_depth_op->output_pixel_stride = output_channel_stride;
  space_to_depth_op->block_size = block_size;

  space_to_depth_op->type = operator_type;
  space_to_depth_op->flags = flags;

  // Not runnable until a successful setup call provides concrete dimensions.
  space_to_depth_op->state = xnn_run_state_invalid;

  *space_to_depth_op_out = space_to_depth_op;
  return xnn_status_success;

error:
  xnn_delete_operator(space_to_depth_op);
  return status;
}
1009 
// Creates an 8-bit-element Space-to-Depth (NHWC) operator.
// Thin public wrapper: tags the operator with the X8 type and defers all
// validation and allocation to the shared creation routine.
enum xnn_status xnn_create_space_to_depth_nhwc_x8(
    size_t input_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    xnn_operator_t* space_to_depth_op_out)
{
  const enum xnn_operator_type operator_type = xnn_operator_type_space_to_depth_nhwc_x8;
  return create_space_to_depth_nhwc(
    input_channels, input_channel_stride, output_channel_stride,
    block_size, flags, operator_type, space_to_depth_op_out);
}
1027 
// Creates a 16-bit-element Space-to-Depth (NHWC) operator.
// Thin public wrapper: tags the operator with the X16 type and defers all
// validation and allocation to the shared creation routine.
enum xnn_status xnn_create_space_to_depth_nhwc_x16(
    size_t input_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    xnn_operator_t* space_to_depth_op_out)
{
  const enum xnn_operator_type operator_type = xnn_operator_type_space_to_depth_nhwc_x16;
  return create_space_to_depth_nhwc(
    input_channels, input_channel_stride, output_channel_stride,
    block_size, flags, operator_type, space_to_depth_op_out);
}
1045 
// Creates a 32-bit-element Space-to-Depth (NHWC) operator.
// Thin public wrapper: tags the operator with the X32 type and defers all
// validation and allocation to the shared creation routine.
enum xnn_status xnn_create_space_to_depth_nhwc_x32(
    size_t input_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    uint32_t block_size,
    uint32_t flags,
    xnn_operator_t* space_to_depth_op_out)
{
  const enum xnn_operator_type operator_type = xnn_operator_type_space_to_depth_nhwc_x32;
  return create_space_to_depth_nhwc(
    input_channels, input_channel_stride, output_channel_stride,
    block_size, flags, operator_type, space_to_depth_op_out);
}
1063 
/// Configures a Space-to-Depth (NHWC) operator for a concrete input size by
/// lowering it to a 5D transpose (the inverse mapping of depth-to-space).
///
/// The [N, H, W, C] input is viewed as a 5D tensor
/// [N*(H/B), B, W/B, B, C] (B = block size, C = input channels) and permuted
/// with perm = {0, 2, 1, 3, 4} to [N*(H/B), W/B, B, B, C], which, laid out
/// with the output strides below, is the [N, H/B, W/B, B*B*C] output.
/// NOTE(review): input_height and input_width are divided by block_size
/// without a divisibility check here -- presumably validated by the caller;
/// confirm.
///
/// Returns xnn_status_invalid_parameter on type mismatch or zero spatial
/// dimensions, xnn_status_uninitialized if XNNPACK is not initialized, and
/// otherwise forwards the status of setup_transpose_nd. A zero batch size
/// is not an error: the operator is marked as a no-op (skip state).
static enum xnn_status setup_space_to_depth_nhwc(
    xnn_operator_t space_to_depth_op,
    enum xnn_operator_type expected_operator_type,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    uint32_t element_size)
{
  if (space_to_depth_op->type != expected_operator_type) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(expected_operator_type),
      xnn_operator_type_to_string(space_to_depth_op->type));
    return xnn_status_invalid_parameter;
  }
  // Invalidate first so a failed setup cannot leave a stale runnable state.
  space_to_depth_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(expected_operator_type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error("failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(expected_operator_type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    // Empty batch: nothing to compute, mark the operator to be skipped.
    space_to_depth_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  // Hoist descriptor fields into locals, consistent with
  // setup_depth_to_space_nhwc above.
  const uint32_t block_size = space_to_depth_op->block_size;
  const size_t channels = space_to_depth_op->channels;
  const size_t input_pixel_stride = space_to_depth_op->input_pixel_stride;
  const size_t output_pixel_stride = space_to_depth_op->output_pixel_stride;

  // 5D view of the input: [N*(H/B), B(height), W/B, B(width), C].
  const size_t input_shape[5] = {
    batch_size * (input_height / block_size), block_size, input_width / block_size, block_size, channels};
  // Swap the block-height and W/B axes: [N*(H/B), W/B, B, B, C].
  const size_t perm[5] = {0, 2, 1, 3, 4};

  // Element strides of the 5D input view; the pixel stride may exceed C to
  // accommodate padded channel layouts.
  const size_t input_stride[5] = {
    block_size * input_width * input_pixel_stride,
    input_width * input_pixel_stride,
    block_size * input_pixel_stride,
    input_pixel_stride,
    1};
  // Element strides of the permuted output, i.e. the [N, H/B, W/B, B*B*C]
  // layout; within one output pixel the B*B*C folded values are contiguous.
  const size_t output_stride[5] = {
    (input_width / block_size) * output_pixel_stride,
    output_pixel_stride,
    block_size * channels,
    channels,
    1};

  return setup_transpose_nd(
      space_to_depth_op,
      input,
      output,
      5,
      input_shape,
      perm,
      input_stride,
      output_stride,
      element_size);
}
1128 
// Sets up an 8-bit-element Space-to-Depth (NHWC) operator for the given
// batch and spatial dimensions. The threadpool parameter is accepted for API
// uniformity; parallelization is decided at run time.
enum xnn_status xnn_setup_space_to_depth_nhwc_x8(
    xnn_operator_t space_to_depth_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  const enum xnn_operator_type operator_type = xnn_operator_type_space_to_depth_nhwc_x8;
  return setup_space_to_depth_nhwc(
    space_to_depth_op, operator_type,
    batch_size, input_height, input_width,
    input, output, sizeof(uint8_t));
}
1144 
// Sets up a 16-bit-element Space-to-Depth (NHWC) operator for the given
// batch and spatial dimensions. The threadpool parameter is accepted for API
// uniformity; parallelization is decided at run time.
enum xnn_status xnn_setup_space_to_depth_nhwc_x16(
    xnn_operator_t space_to_depth_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  const enum xnn_operator_type operator_type = xnn_operator_type_space_to_depth_nhwc_x16;
  return setup_space_to_depth_nhwc(
    space_to_depth_op, operator_type,
    batch_size, input_height, input_width,
    input, output, sizeof(uint16_t));
}
1160 
// Sets up a 32-bit-element Space-to-Depth (NHWC) operator for the given
// batch and spatial dimensions. The threadpool parameter is accepted for API
// uniformity; parallelization is decided at run time.
enum xnn_status xnn_setup_space_to_depth_nhwc_x32(
    xnn_operator_t space_to_depth_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    pthreadpool_t threadpool)
{
  const enum xnn_operator_type operator_type = xnn_operator_type_space_to_depth_nhwc_x32;
  return setup_space_to_depth_nhwc(
    space_to_depth_op, operator_type,
    batch_size, input_height, input_width,
    input, output, sizeof(uint32_t));
}
1176