• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #pragma once
10 
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 
15 #include <pthreadpool.h>
16 
17 #ifdef __cplusplus
18 extern "C" {
19 #endif
20 
21 /// The number of bytes XNNPACK may read beyond array bounds.
22 /// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
23 ///
24 /// Note: XNNPACK reads, but never writes beyond array bounds.
25 #define XNN_EXTRA_BYTES 16
26 
27 /// Maximum number of dimensions in tensor shape.
28 #define XNN_MAX_TENSOR_DIMS 6
29 
30 /// Allow sparse inference in a Runtime.
31 ///
32 /// Note: this flag hints XNNPACK to consider sparse inference, but does not guarantee it.
33 #define XNN_FLAG_SPARSE_INFERENCE 0x00000001
34 #define XNN_FLAG_HINT_SPARSE_INFERENCE XNN_FLAG_SPARSE_INFERENCE
35 
36 /// Allow IEEE FP16 inference in a Runtime.
37 ///
38 /// Note: this flag hints XNNPACK to consider IEEE FP16 inference, but does not guarantee it.
39 #define XNN_FLAG_FP16_INFERENCE 0x00000002
40 #define XNN_FLAG_HINT_FP16_INFERENCE XNN_FLAG_FP16_INFERENCE
41 
42 /// Force IEEE FP16 inference in a Runtime, and fail if FP16 inference is not possible.
43 ///
44 /// Note: this flag guarantees that XNNPACK will use IEEE FP16 inference, or fail to create the Runtime object.
45 /// Warning: on x86 systems FP16 computations will be emulated at a substantial performance cost.
46 #define XNN_FLAG_FORCE_FP16_INFERENCE 0x00000004
47 
48 /// Enable timing of each operator's runtime.
49 #define XNN_FLAG_BASIC_PROFILING 0x00000008
50 
/// The convolution operator represents a depthwise convolution, and uses the HWGo layout for filters.
#define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001
53 
54 /// Assume transposed weights in a fully connected operator.
55 #define XNN_FLAG_TRANSPOSE_WEIGHTS 0x00000001
56 
57 /// The operator assumes NHWC layout for the input, regardless of the output layout.
58 #define XNN_FLAG_INPUT_NHWC 0x00000002
59 
60 /// Match "SAME" padding in TensorFlow. Exact padding values are computed dynamically depending on input size.
61 #define XNN_FLAG_TENSORFLOW_SAME_PADDING 0x00000004
62 
63 /// Implicitly flatten and reshape input of a Fully Connected operator into a 2D tensor.
64 #define XNN_FLAG_TENSORFLOW_RESHAPE_2D 0x00000004
65 
66 /// Match behaviour of TensorFlow 1.x.
67 #define XNN_FLAG_TENSORFLOW_LEGACY_MODE 0x00000004
68 
69 /// Static weights of the FP16 operator are in FP32 format.
70 #define XNN_FLAG_FP32_STATIC_WEIGHTS 0x00000008
71 
72 /// Align corners of input and output images in resize operations.
73 #define XNN_FLAG_ALIGN_CORNERS 0x00000008
74 
75 /// Yield worker threads of the thread pool to the system scheduler after the inference.
76 #define XNN_FLAG_YIELD_WORKERS 0x00000010
77 
/// Status code for any XNNPACK function call.
enum xnn_status {
  /// The call succeeded, and all output arguments now contain valid data.
  xnn_status_success = 0,
  /// The library was not initialized (see xnn_initialize).
  xnn_status_uninitialized = 1,
  /// One of the arguments passed to the call was invalid.
  xnn_status_invalid_parameter = 2,
  /// The object the call operates on is in a state that does not allow the requested operation.
  xnn_status_invalid_state = 3,
  /// An argument value is valid in general, but not supported in this configuration.
  xnn_status_unsupported_parameter = 4,
  /// The host hardware does not satisfy the requirements for the requested operation.
  xnn_status_unsupported_hardware = 5,
  /// A memory allocation failed.
  xnn_status_out_of_memory = 6,
};
89 
/// User-provided memory management hooks.
///
/// A pointer to this structure can be passed to xnn_initialize to override the system-provided memory management
/// functions for all allocations made by XNNPACK.
struct xnn_allocator {
  /// User-specified pointer that will be passed as-is to all functions in this structure.
  void* context;
  /// Pointer to a function to be called for general memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*allocate)(void* context, size_t size);
  /// Pointer to a function to be called for general memory re-allocation, i.e. to increase or shrink a previously
  /// allocated memory block. The content of the old memory block is copied to the new memory block.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref reallocate call is equivalent to an @ref allocate call.
  /// @param size - The new size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the newly allocated memory block of at least @ref size bytes with the content of the previous
  ///          memory block.
  ///          If allocation fails, the function must return NULL, but must not release the previous memory block.
  void* (*reallocate)(void* context, void* pointer, size_t size);
  /// Pointer to a function to be called for general memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref allocate or @ref reallocate functions. Can be NULL.
  ///                  If the pointer is NULL, the @ref deallocate call is a no-op.
  void (*deallocate)(void* context, void* pointer);
  /// Pointer to a function to be called for aligned memory allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param alignment - The alignment of the memory block to allocate, in bytes. Alignment is always a power-of-2.
  /// @param size - The size of the memory block to allocate, in bytes.
  ///
  /// @returns Pointer to the allocated memory block of at least @ref size bytes.
  ///          If allocation fails, the function must return NULL.
  void* (*aligned_allocate)(void* context, size_t alignment, size_t size);
  /// Pointer to a function to be called for aligned memory de-allocation.
  ///
  /// @param context - The user-specified pointer from xnn_allocator structure.
  /// @param pointer - Pointer to a memory block allocated by @ref aligned_allocate function. Can be NULL.
  ///                  If the pointer is NULL, the @ref aligned_deallocate call is a no-op.
  void (*aligned_deallocate)(void* context, void* pointer);
};
135 
/// Initialize XNNPACK library.
///
/// XNNPACK must be successfully initialized before use. During initialization, XNNPACK populates internal structures
/// depending on the host processor. Initialization can be time-consuming.
///
/// @param[in] allocator - structure with function pointers to be used for memory allocation and de-allocation.
///                        If this argument is NULL, system-provided memory management functions (e.g. malloc/free)
///                        will be used.
///
/// @retval xnn_status_success - XNNPACK is successfully initialized and ready to use.
/// @retval xnn_status_out_of_memory - initialization failed due to out-of-memory condition.
/// @retval xnn_status_unsupported_hardware - initialization failed because the host processor does not satisfy the
///                                           minimum hardware requirements for XNNPACK. E.g. this may happen on x86
///                                           processors without SSE2 extension, or on 32-bit ARM processors without
///                                           the NEON SIMD extension.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator);
152 
/// Deinitialize XNNPACK library, releasing resources acquired during initialization.
///
/// To avoid memory and resource leaks, users must call xnn_deinitialize once for each successful xnn_initialize call.
///
/// @retval xnn_status_success - deinitialization call succeeded.
enum xnn_status xnn_deinitialize(void);
159 
160 /// Subgraph is an abstract representation of a neural network model.
161 /// Subgraph objects are used to define Values (tensors) and Nodes (operators) comprising the model.
162 typedef struct xnn_subgraph* xnn_subgraph_t;
163 
/// Create an empty Subgraph object.
///
/// @param external_value_ids - number of Value IDs to reserve for communication with external graph representation.
///                             The Subgraph object would avoid creating internal Value IDs in the
///                             [0, external_value_ids-1] range.
/// @param flags - binary features of the subgraph. No supported flags are currently defined.
/// @param subgraph_out - pointer to the variable that will be initialized with a handle to the Subgraph object upon
///                       successful return.
enum xnn_status xnn_create_subgraph(
  uint32_t external_value_ids,
  uint32_t flags,
  xnn_subgraph_t* subgraph_out);
176 
/// Destroy a Subgraph object, as well as the Values and Nodes associated with the subgraph.
///
/// @param subgraph - the Subgraph object to destroy.
enum xnn_status xnn_delete_subgraph(
  xnn_subgraph_t subgraph);
182 
183 #define XNN_VALUE_FLAG_EXTERNAL_INPUT  0x00000001
184 #define XNN_VALUE_FLAG_EXTERNAL_OUTPUT 0x00000002
185 
186 #define XNN_INVALID_VALUE_ID UINT32_MAX
187 
/// Type of elements in a Value object.
enum xnn_datatype {
  /// Invalid data type. Valid Values never have this datatype.
  xnn_datatype_invalid = 0,
  /// IEEE754 single-precision floating-point.
  xnn_datatype_fp32 = 1,
  /// IEEE754 half-precision floating-point.
  xnn_datatype_fp16 = 2,
  /// Quantized 8-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint8 = 3,
  /// Quantized 8-bit unsigned integer with shared per-Value quantization parameters.
  xnn_datatype_quint8 = 4,
  /// Quantized 32-bit signed integer with shared per-Value quantization parameters.
  xnn_datatype_qint32 = 5,
  /// Quantized 8-bit signed integer with shared per-channel quantization parameters
  /// (see xnn_define_channelwise_quantized_tensor_value).
  xnn_datatype_qcint8 = 6,
  /// Quantized 32-bit signed integer with shared per-channel quantization parameters
  /// (see xnn_define_channelwise_quantized_tensor_value).
  xnn_datatype_qcint32 = 7,
};
207 
/// Define a tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param num_dims - number of dimensions in the shape.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  size_t num_dims,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
234 
/// Define a quantized tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param zero_point - offset from zero to subtract from the quantized elements in the Value.
/// @param scale - multiplication factor to convert quantized elements to real representation.
/// @param num_dims - number of dimensions in the shape.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_quantized_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  int32_t zero_point,
  float scale,
  size_t num_dims,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
265 
/// Define a channelwise quantized tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
/// @param datatype - type of the tensor elements.
/// @param scale - per-channel multiplication factors to convert quantized elements to real representation.
/// @param num_dims - number of dimensions in the shape.
/// @param channel_dim - index of the channel dimension in the tensor with per-channel quantization parameters.
///                      Typically this is the first dimension (dimension #0) of the filter tensors in the Convolution,
///                      Deconvolution, and Fully Connected operators and the last dimension of the filter tensors in
///                      the Depthwise Convolution operators.
/// @param dims - pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
///               XNNPACK does not keep any pointers to this array after the function returns.
/// @param data - pointer to static data used for tensor initialization. If the tensor is not statically initialized,
///               this pointer must be NULL. If non-NULL, the life-time of the static data must exceed the life-time
///               of the Subgraph object, and of any Runtime objects created from the Subgraph.
/// @param external_id - external ID for the Value. The ID must be within the range of reserved Value IDs specified on
///                      the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
///                      created for the Value.
/// @param flags - binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
///                and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
/// @param id_out - pointer to the variable that will be initialized with the Value ID upon successful return. If a
///                 valid @a external_id was provided, the variable will be initialized with the @a external_id value.
enum xnn_status xnn_define_channelwise_quantized_tensor_value(
  xnn_subgraph_t subgraph,
  enum xnn_datatype datatype,
  const float* scale,
  size_t num_dims,
  size_t channel_dim,
  const size_t* dims,
  const void* data,
  uint32_t external_id,
  uint32_t flags,
  uint32_t* id_out);
299 
/// Define a Convert Node and add it to a Subgraph.
///
/// NOTE(review): the conversion target is presumably determined by the datatypes of the input and output Values —
/// confirm against the implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Convert Node. No supported flags are currently defined.
enum xnn_status xnn_define_convert(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
312 
/// Define a 2D Convolution Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param groups - number of convolution groups.
/// @param group_input_channels - number of input channels per group.
/// @param group_output_channels - number of output channels per group.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, groups * group_input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
///                    dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Convolution Node without a bias. If
///                  present, the bias tensor must be a 1D tensor defined in the @a subgraph with [groups *
///                  group_output_channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, groups * group_output_channels] dimensions.
/// @param flags - binary features of the 2D Convolution Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_convolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
369 
/// Define a 2D Deconvolution (Transposed Convolution) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param padding_top - implicit padding above 2D output data.
/// @param padding_right - implicit padding to the right of 2D output data.
/// @param padding_bottom - implicit padding below 2D output data.
/// @param padding_left - implicit padding to the left of 2D output data.
/// @param adjustment_height - additional elements in the bottom of the 2D output data.
/// @param adjustment_width - additional elements to the right of the 2D output data.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param upsampling_height - height of upsampling region for deconvolution input (deconvolution height stride).
/// @param upsampling_width - width of upsampling region for deconvolution input (deconvolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param groups - number of convolution groups.
/// @param group_input_channels - number of input channels per group.
/// @param group_output_channels - number of output channels per group.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, groups * group_input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [groups * group_output_channels, kernel_height, kernel_width, group_input_channels]
///                    dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Deconvolution Node without a bias.
///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with
///                  [groups * group_output_channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, groups * group_output_channels] dimensions.
/// @param flags - binary features of the 2D Deconvolution Node. No supported flags are currently defined.
enum xnn_status xnn_define_deconvolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t padding_top,
  uint32_t padding_right,
  uint32_t padding_bottom,
  uint32_t padding_left,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t upsampling_height,
  uint32_t upsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
425 
/// Define a 2D Depthwise Convolution Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
///                            flag is specified.
/// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
/// @param kernel_height - kernel (filter) height.
/// @param kernel_width - kernel (filter) width.
/// @param subsampling_height - height of subsampling region for convolution output (convolution height stride).
/// @param subsampling_width - width of subsampling region for convolution output (convolution width stride).
/// @param dilation_height - dilation of kernel elements along the height dimension.
/// @param dilation_width - dilation of kernel elements along the width dimension.
/// @param depth_multiplier - ratio of output channels to input channels.
/// @param input_channels - number of input channels.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, input_channels] dimensions.
/// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 4D tensor defined in the @a subgraph
///                    with [1, kernel_height, kernel_width, input_channels * depth_multiplier] dimensions.
/// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a 2D Depthwise Convolution Node without
///                  a bias. If present, the bias tensor must be a 1D tensor defined in the @a subgraph with
///                  [input_channels * depth_multiplier] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, OH, OW, input_channels * depth_multiplier] dimensions.
/// @param flags - binary features of the 2D Depthwise Convolution Node. The only currently supported value is
///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
enum xnn_status xnn_define_depthwise_convolution_2d(
  xnn_subgraph_t subgraph,
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t depth_multiplier,
  size_t input_channels,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t filter_id,
  uint32_t bias_id,
  uint32_t output_id,
  uint32_t flags);
479 
/// Define a Depth To Space Node and add it to a Subgraph.
///
/// The Depth To Space Node rearranges data from depth into blocks of spatial data (a reverse transform to
/// Space To Depth). For a given input pixel, an output square of pixels with side @a block_size is formed from values
/// in the corresponding number of its channels. The output depth is therefore @a block_size x @a block_size times
/// smaller than that of the input.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, IH, IW, OC * block_size * block_size] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, IH * block_size, IW * block_size, OC] dimensions.
/// @param block_size - the size of the spatial block.
/// @param flags - binary features of the Depth To Space Node. No supported flags are currently defined.
enum xnn_status xnn_define_depth_to_space(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t block_size,
  uint32_t flags);
500 
/// Define a 1D Global Average Pooling Node and add it to a Subgraph.
///
/// NOTE(review): whether the pooled dimension is retained with size 1 or removed in the output is not specified
/// here — confirm against the implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 2 or more dimensions
///                   defined in the @a subgraph. Averaging is performed across the second-innermost dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 2 or more
///                    dimensions defined in the @a subgraph.
/// @param flags - binary features of the 1D Global Average Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_global_average_pooling_1d(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
518 
/// Define a 2D Global Average Pooling Node and add it to a Subgraph.
///
/// NOTE(review): whether the pooled dimensions are retained with size 1 or removed in the output is not specified
/// here — confirm against the implementation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be a dense tensor with 3 or more dimensions
///                   defined in the @a subgraph. Averaging is performed across the second- and third-innermost
///                   dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a dense tensor with 3 or more
///                    dimensions defined in the @a subgraph.
/// @param flags - binary features of the 2D Global Average Pooling Node. No supported flags are currently defined.
enum xnn_status xnn_define_global_average_pooling_2d(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
537 
538 /// Define a 2D Average Pooling Node and add it to a Subgraph.
539 ///
540 /// @param subgraph - a Subgraph object that will own the created Node.
541 /// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
542 ///                            flag is specified.
543 /// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
544 ///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
545 /// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
546 ///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
547 /// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
548 ///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
549 /// @param pooling_height - pooling (kernel) height.
550 /// @param pooling_width - pooling (kernel) width.
551 /// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
552 ///                        to vertically adjacent output pixels.
553 /// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
554 ///                        to horizontally adjacent output pixels.
555 /// @param output_min - lower bound for clipping output values.
556 /// @param output_max - upper bound for clipping output values.
557 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
558 ///                   with [N, IH, IW, channels] dimensions.
559 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
560 ///                    with [N, OH, OW, channels] dimensions.
561 /// @param flags - binary features of the 2D Average Pooling Node. The only currently supported value is
562 ///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
563 enum xnn_status xnn_define_average_pooling_2d(
564   xnn_subgraph_t subgraph,
565   uint32_t input_padding_top,
566   uint32_t input_padding_right,
567   uint32_t input_padding_bottom,
568   uint32_t input_padding_left,
569   uint32_t pooling_height,
570   uint32_t pooling_width,
571   uint32_t stride_height,
572   uint32_t stride_width,
573   float output_min,
574   float output_max,
575   uint32_t input_id,
576   uint32_t output_id,
577   uint32_t flags);
578 
579 /// Define a Fully Connected Node and add it to a Subgraph.
580 ///
581 /// @param subgraph - a Subgraph object that will own the created Node.
582 /// @param output_min - lower bound for clipping output values.
583 /// @param output_max - upper bound for clipping output values.
584 /// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the
585 ///                   @a subgraph. If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the input tensor must be at least
586 ///                   1D and its last dimension must match the last dimension of the filter tensor. In particular, if
587 ///                   input is a 2D tensor, it must have [batch_size, input_channels] dimensions.
588 ///                   If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, the number of elements in the input tensor must be
589 ///                   divisible by the input_channels. The tensor will be first flattened into a 1D tensor of
590 ///                   [num_input_elements] dimensions, then reshaped into a 2D tensor of
591 ///                   [num_input_elements / input_channels, input_channels] dimensions where num_input_elements is the
592 ///                   total number of elements in the input tensor.
593 /// @param filter_id - Value ID for the filter tensor. The filter tensor must be a 2D tensor defined in the @a subgraph.
594 ///                    If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is not specified, the filter tensor must have
595 ///                    [output_channels, input_channels] dimensions. If the XNN_FLAG_TRANSPOSE_WEIGHTS flag is
596 ///                    specified, the filter tensor must have [input_channels, output_channels] dimensions.
597 /// @param bias_id - Value ID for the bias tensor, or XNN_INVALID_VALUE_ID for a Fully Connected Node without a bias.
598 ///                  If present, the bias tensor must be a 1D tensor defined in the @a subgraph with [output_channels]
599 ///                  dimensions.
600 /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph.
601 ///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is not specified, the output tensor must have the same
602 ///                    dimensionality as the input tensor, all its dimensions but the last one must match the
603 ///                    corresponding dimensions of the input tensor, and the last dimension of the output tensor must
604 ///                    match the first dimension of the filter tensor. In particular, if input is a 2D tensor, output
605 ///                    must be a 2D tensor of [batch_size, output_channels] dimensions.
606 ///                    If XNN_FLAG_TENSORFLOW_RESHAPE_2D is specified, output must be a 2D tensor of
607 ///                    [num_input_elements / input_channels, output_channels] dimensions where num_input_elements is the
608 ///                    total number of elements in the input tensor.
609 /// @param flags - binary features of the Fully Connected Node. The only currently supported values are
610 ///                XNN_FLAG_TENSORFLOW_RESHAPE_2D and XNN_FLAG_TRANSPOSE_WEIGHTS.
611 enum xnn_status xnn_define_fully_connected(
612   xnn_subgraph_t subgraph,
613   float output_min,
614   float output_max,
615   uint32_t input_id,
616   uint32_t filter_id,
617   uint32_t bias_id,
618   uint32_t output_id,
619   uint32_t flags);
620 
621 /// Define a 2D Max Pooling Node and add it to a Subgraph.
622 ///
623 /// @param subgraph - a Subgraph object that will own the created Node.
624 /// @param input_padding_top - implicit zero-padding above 2D input data. Must be 0 if XNN_FLAG_TENSORFLOW_SAME_PADDING
625 ///                            flag is specified.
626 /// @param input_padding_right - implicit zero-padding to the right of 2D input data. Must be 0 if
627 ///                              XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
628 /// @param input_padding_bottom - implicit zero-padding below 2D input data. Must be 0 if
629 ///                               XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
630 /// @param input_padding_left - implicit zero-padding to the left of 2D input data. Must be 0 if
631 ///                             XNN_FLAG_TENSORFLOW_SAME_PADDING flag is specified.
632 /// @param pooling_height - pooling (kernel) height.
633 /// @param pooling_width - pooling (kernel) width.
634 /// @param stride_height - displacing of the pooling window in the vertical dimension of the input pixels corresponding
635 ///                        to vertically adjacent output pixels.
636 /// @param stride_width - displacing of the pooling window in the horizontal dimension of the input pixels corresponding
637 ///                        to horizontally adjacent output pixels.
638 /// @param dilation_height - dilation of pooling elements along the height dimension.
639 /// @param dilation_width - dilation of pooling elements along the width dimension.
640 /// @param output_min - lower bound for clipping output values.
641 /// @param output_max - upper bound for clipping output values.
642 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
643 ///                   with [N, IH, IW, channels] dimensions.
644 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
645 ///                    with [N, OH, OW, channels] dimensions.
646 /// @param flags - binary features of the 2D Max Pooling Node. The only currently supported value is
647 ///                XNN_FLAG_TENSORFLOW_SAME_PADDING.
648 enum xnn_status xnn_define_max_pooling_2d(
649   xnn_subgraph_t subgraph,
650   uint32_t input_padding_top,
651   uint32_t input_padding_right,
652   uint32_t input_padding_bottom,
653   uint32_t input_padding_left,
654   uint32_t pooling_height,
655   uint32_t pooling_width,
656   uint32_t stride_height,
657   uint32_t stride_width,
658   uint32_t dilation_height,
659   uint32_t dilation_width,
660   float output_min,
661   float output_max,
662   uint32_t input_id,
663   uint32_t output_id,
664   uint32_t flags);
665 
666 /// Define a 2D ArgMax Pooling Node and add it to a Subgraph.
667 ///
668 /// @param subgraph - a Subgraph object that will own the created Node.
669 /// @param input_padding_top - implicit zero-padding above 2D input data.
670 /// @param input_padding_right - implicit zero-padding to the right of 2D input data.
671 /// @param input_padding_bottom - implicit zero-padding below 2D input data.
672 /// @param input_padding_left - implicit zero-padding to the left of 2D input data.
673 /// @param pooling_height - pooling (kernel) height. Vertical stride between pooling regions matches this value.
674 /// @param pooling_width - pooling (kernel) width. Horizontal stride between pooling regions matches this value.
675 /// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
676 ///                   with [N, IH, IW, channels] dimensions.
677 /// @param output_value_id - Value ID for the output tensor with the maximum values in the pools. The output tensor must
678 ///                          be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels] dimensions.
679 /// @param output_index_id - Value ID for the output tensor with the indexes of the maximum values in the pools. The
680 ///                          output tensor must be a 4D tensor defined in the @a subgraph with [N, OH, OW, channels]
681 ///                          dimensions.
682 /// @param flags - binary features of the 2D ArgMax Pooling Node. No supported flags are currently defined.
683 enum xnn_status xnn_define_argmax_pooling_2d(
684   xnn_subgraph_t subgraph,
685   uint32_t input_padding_top,
686   uint32_t input_padding_right,
687   uint32_t input_padding_bottom,
688   uint32_t input_padding_left,
689   uint32_t pooling_height,
690   uint32_t pooling_width,
691   uint32_t input_id,
692   uint32_t output_value_id,
693   uint32_t output_index_id,
694   uint32_t flags);
695 
696 /// Define a 2D UnPooling Node and add it to a Subgraph.
697 ///
698 /// @param subgraph - a Subgraph object that will own the created Node.
699 /// @param padding_top - implicit padding above 2D output data.
700 /// @param padding_right - implicit padding to the right of 2D output data.
701 /// @param padding_bottom - implicit padding below 2D output data.
702 /// @param padding_left - implicit padding to the left of 2D output data.
703 /// @param pooling_height - height of the pooling window.
704 /// @param pooling_width - width of the pooling window.
705 /// @param input_value_id - Value ID for the input tensor with the max-pooling values to invert. The input value tensor
706 ///                         must be a 4D tensor defined in the @a subgraph with [N, IH, IW, channels] dimensions.
707 /// @param input_index_id - Value ID for the input tensor with the indices of the per-pool maximum values produced by
708 ///                         a 2D ArgMax Pooling Node. The input tensor must be a 4D tensor defined in the @a subgraph with
709 ///                         [N, IH, IW, channels] dimensions.
710 /// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
711 ///                    with [N, OH, OW, channels] dimensions.
712 /// @param flags - binary features of the 2D UnPooling Node. No supported flags are currently defined.
713 enum xnn_status xnn_define_unpooling_2d(
714   xnn_subgraph_t subgraph,
715   uint32_t padding_top,
716   uint32_t padding_right,
717   uint32_t padding_bottom,
718   uint32_t padding_left,
719   uint32_t pooling_height,
720   uint32_t pooling_width,
721   uint32_t input_value_id,
722   uint32_t input_index_id,
723   uint32_t output_id,
724   uint32_t flags);
725 
726 /// Define a 2-Input Add Node and add it to a Subgraph.
727 ///
728 /// The 2-Input Add Node computes elementwise addition of two tensor inputs with numpy broadcasting rules.
729 ///
730 /// @param subgraph - a Subgraph object that will own the created Node.
731 /// @param output_min - lower bound for clipping output values.
732 /// @param output_max - upper bound for clipping output values.
733 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
734 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
735 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
736 ///                    that dimension.
737 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
738 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
739 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
740 ///                    that dimension.
741 /// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
742 ///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
743 ///                    of the two inputs.
744 /// @param flags - binary features of the Add Node. No supported flags are currently defined.
745 enum xnn_status xnn_define_add2(
746   xnn_subgraph_t subgraph,
747   float output_min,
748   float output_max,
749   uint32_t input1_id,
750   uint32_t input2_id,
751   uint32_t output_id,
752   uint32_t flags);
753 
754 /// Define a 2-Input Multiply Node and add it to a Subgraph.
755 ///
756 /// The 2-Input Multiply Node computes elementwise multiplication of two tensor inputs with numpy broadcasting rules.
757 ///
758 /// @param subgraph - a Subgraph object that will own the created Node.
759 /// @param output_min - lower bound for clipping output values.
760 /// @param output_max - upper bound for clipping output values.
761 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
762 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
763 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
764 ///                    that dimension.
765 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
766 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
767 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
768 ///                    that dimension.
769 /// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
770 ///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
771 ///                    of the two inputs.
772 /// @param flags - binary features of the Multiply Node. No supported flags are currently defined.
773 enum xnn_status xnn_define_multiply2(
774   xnn_subgraph_t subgraph,
775   float output_min,
776   float output_max,
777   uint32_t input1_id,
778   uint32_t input2_id,
779   uint32_t output_id,
780   uint32_t flags);
781 
782 /// Define a Subtract Node and add it to a Subgraph.
783 ///
784 /// The Subtract Node computes elementwise subtraction of two tensor inputs with numpy broadcasting rules.
785 ///
786 /// @param subgraph - a Subgraph object that will own the created Node.
787 /// @param output_min - lower bound for clipping output values.
788 /// @param output_max - upper bound for clipping output values.
789 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
790 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
791 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
792 ///                    that dimension.
793 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
794 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
795 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
796 ///                    that dimension.
797 /// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
798 ///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
799 ///                    of the two inputs.
800 /// @param flags - binary features of the Subtract Node. No supported flags are currently defined.
801 enum xnn_status xnn_define_subtract(
802   xnn_subgraph_t subgraph,
803   float output_min,
804   float output_max,
805   uint32_t input1_id,
806   uint32_t input2_id,
807   uint32_t output_id,
808   uint32_t flags);
809 
810 /// Define a Divide Node and add it to a Subgraph.
811 ///
812 /// The Divide Node computes elementwise division of two tensor inputs with numpy broadcasting rules.
813 ///
814 /// @param subgraph - a Subgraph object that will own the created Node.
815 /// @param output_min - lower bound for clipping output values.
816 /// @param output_max - upper bound for clipping output values.
817 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
818 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
819 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
820 ///                    that dimension.
821 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
822 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
823 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
824 ///                    that dimension.
825 /// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
826 ///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
827 ///                    of the two inputs.
828 /// @param flags - binary features of the Divide Node. No supported flags are currently defined.
829 enum xnn_status xnn_define_divide(
830   xnn_subgraph_t subgraph,
831   float output_min,
832   float output_max,
833   uint32_t input1_id,
834   uint32_t input2_id,
835   uint32_t output_id,
836   uint32_t flags);
837 
838 /// Define a 2-Input Maximum Node and add it to a Subgraph.
839 ///
840 /// The 2-Input Maximum Node computes elementwise maximum of two tensor inputs with numpy broadcasting rules.
841 ///
842 /// @param subgraph - a Subgraph object that will own the created Node.
843 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
844 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
845 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
846 ///                    that dimension.
847 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
848 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
849 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
850 ///                    that dimension.
851 /// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
852 ///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
853 ///                    of the two inputs.
854 /// @param flags - binary features of the Maximum Node. No supported flags are currently defined.
855 enum xnn_status xnn_define_maximum2(
856   xnn_subgraph_t subgraph,
857   uint32_t input1_id,
858   uint32_t input2_id,
859   uint32_t output_id,
860   uint32_t flags);
861 
862 /// Define a 2-Input Minimum Node and add it to a Subgraph.
863 ///
864 /// The 2-Input Minimum Node computes elementwise minimum of two tensor inputs with numpy broadcasting rules.
865 ///
866 /// @param subgraph - a Subgraph object that will own the created Node.
867 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
868 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
869 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
870 ///                    that dimension.
871 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
872 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
873 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
874 ///                    that dimension.
875 /// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
876 ///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
877 ///                    of the two inputs.
878 /// @param flags - binary features of the Minimum Node. No supported flags are currently defined.
879 enum xnn_status xnn_define_minimum2(
880   xnn_subgraph_t subgraph,
881   uint32_t input1_id,
882   uint32_t input2_id,
883   uint32_t output_id,
884   uint32_t flags);
885 
886 /// Define a Squared Difference Node and add it to a Subgraph.
887 ///
888 /// The Squared Difference Node computes elementwise squared difference of two tensor inputs with numpy broadcasting
889 /// rules.
890 ///
891 /// @param subgraph - a Subgraph object that will own the created Node.
892 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
893 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the second
894 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
895 ///                    that dimension.
896 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an M-dimensional tensor defined in
897 ///                    the @a subgraph with each dimension either equal to the corresponding dimension of the first
898 ///                    input, or equal to 1. In the latter case, the elements of the input tensor are broadcasted along
899 ///                    that dimension.
900 /// @param output_id - Value ID for the output tensor. The output tensor must be a max(N,M)-dimensional tensor defined
901 ///                    in the @a subgraph with each dimension equal to the maximum between the corresponding dimension
902 ///                    of the two inputs.
903 /// @param flags - binary features of the Squared Difference Node. No supported flags are currently defined.
904 enum xnn_status xnn_define_squared_difference(
905   xnn_subgraph_t subgraph,
906   uint32_t input1_id,
907   uint32_t input2_id,
908   uint32_t output_id,
909   uint32_t flags);
910 
911 /// Define a Constant Pad Node with static padding specification and add it to a Subgraph.
912 ///
913 /// @param subgraph - a Subgraph object that will own the created Node.
914 /// @param pre_paddings - number of padding elements to insert before input elements for every dimension. This array
915 ///                       must have as many elements as the number of dimensions in the input tensor.
916 /// @param post_paddings - number of padding elements to insert after input elements for every dimension. This array
917 ///                        must have as many elements as the number of dimensions in the input tensor.
918 /// @param padding_value - constant value used to initialize padding elements.
919 /// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
920 /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
921 ///                    shape must match the shape of the input tensor with padding.
922 /// @param flags - binary features of the Constant Pad Node. No supported flags are currently defined.
923 enum xnn_status xnn_define_static_constant_pad(
924   xnn_subgraph_t subgraph,
925   const size_t* pre_paddings,
926   const size_t* post_paddings,
927   float padding_value,
928   uint32_t input_id,
929   uint32_t output_id,
930   uint32_t flags);
931 
932 /// Define a 2-Input Concatenate Node and add it to a Subgraph.
933 ///
934 /// The 2-Input Concatenate Node concatenates two tensors along a specified axis.
935 ///
936 /// @param subgraph - a Subgraph object that will own the created Node.
937 /// @param axis - the axis to concatenate the two input tensors along
938 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
939 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
940 ///                    second input.
941 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
942 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
943 ///                    first input.
944 /// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
945 ///                    in the @a subgraph with each dimension equal to the dimension of both inputs, except the axis
946 ///                    dimension, where it is the sum of the corresponding dimensions of both inputs.
947 /// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
948 enum xnn_status xnn_define_concatenate2(
949   xnn_subgraph_t subgraph,
950   size_t axis,
951   uint32_t input1_id,
952   uint32_t input2_id,
953   uint32_t output_id,
954   uint32_t flags);
955 
956 /// Define a 3-Input Concatenate Node and add it to a Subgraph.
957 ///
958 /// The 3-Input Concatenate Node concatenates three tensors along a specified axis.
959 ///
960 /// @param subgraph - a Subgraph object that will own the created Node.
961 /// @param axis - the axis to concatenate the three input tensors along
962 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
963 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
964 ///                    other inputs.
965 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
966 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
967 ///                    other inputs.
968 /// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
969 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
970 ///                    other inputs.
971 /// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
972 ///                    in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
973 ///                    dimension, where it is the sum of the corresponding dimensions of all inputs.
974 /// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
975 enum xnn_status xnn_define_concatenate3(
976   xnn_subgraph_t subgraph,
977   size_t axis,
978   uint32_t input1_id,
979   uint32_t input2_id,
980   uint32_t input3_id,
981   uint32_t output_id,
982   uint32_t flags);
983 
984 /// Define a 4-Input Concatenate Node and add it to a Subgraph.
985 ///
986 /// The 4-Input Concatenate Node concatenates four tensors along a specified axis.
987 ///
988 /// @param subgraph - a Subgraph object that will own the created Node.
989 /// @param axis - the axis to concatenate the four input tensors along
990 /// @param input1_id - Value ID for the first input tensor. The input tensor must be an N-dimensional tensor defined in
991 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
992 ///                    other inputs.
993 /// @param input2_id - Value ID for the second input tensor. The input tensor must be an N-dimensional tensor defined in
994 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
995 ///                    other inputs.
996 /// @param input3_id - Value ID for the third input tensor. The input tensor must be an N-dimensional tensor defined in
997 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
998 ///                    other inputs.
999 /// @param input4_id - Value ID for the fourth input tensor. The input tensor must be an N-dimensional tensor defined in
1000 ///                    the @a subgraph with each dimension, except the axis, equal to the corresponding dimension of the
1001 ///                    other inputs.
1002 /// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
1003 ///                    in the @a subgraph with each dimension equal to the dimension of all inputs, except the axis
1004 ///                    dimension, where it is the sum of the corresponding dimensions of all inputs.
1005 /// @param flags - binary features of the Concatenate Node. No supported flags are currently defined.
1006 enum xnn_status xnn_define_concatenate4(
1007   xnn_subgraph_t subgraph,
1008   size_t axis,
1009   uint32_t input1_id,
1010   uint32_t input2_id,
1011   uint32_t input3_id,
1012   uint32_t input4_id,
1013   uint32_t output_id,
1014   uint32_t flags);
1015 
/// Define a 2-Output Split Node and add it to a Subgraph.
///
/// The 2-Output Split Node splits an input tensor into two output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the second output. The split_dim dimension is half of the input's split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the split_dim, equal to the
///                     corresponding dimension of the first output. The split_dim dimension is half of the input's
///                     split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split2(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t flags);
1038 
/// Define a 3-Output Split Node and add it to a Subgraph.
///
/// The 3-Output Split Node splits an input tensor into three output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the second and third output. The split_dim dimension is one third of the input's
///                     split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the first and third output. The split_dim dimension is one third of the input's
///                     split_dim.
/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the first and second output. The split_dim dimension is one third of the input's
///                     split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split3(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t output3_id,
  uint32_t flags);
1067 
/// Define a 4-Output Split Node and add it to a Subgraph.
///
/// The 4-Output Split Node splits an input tensor into four output tensors along a specified axis evenly.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param split_dim - the dimension to split the input tensor along.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in the @a
///                   subgraph.
/// @param output1_id - Value ID for the first output tensor. The output tensor must be an N-dimensional tensor defined
///                     in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param output2_id - Value ID for the second output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param output3_id - Value ID for the third output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param output4_id - Value ID for the fourth output tensor. The output tensor must be an N-dimensional tensor
///                     defined in the @a subgraph with each dimension, except the split_dim, equal to the corresponding
///                     dimension of the other output tensors. The split_dim dimension is one fourth of the input's
///                     split_dim.
/// @param flags - binary features of the Split Node. No supported flags are currently defined.
enum xnn_status xnn_define_even_split4(
  xnn_subgraph_t subgraph,
  size_t split_dim,
  uint32_t input_id,
  uint32_t output1_id,
  uint32_t output2_id,
  uint32_t output3_id,
  uint32_t output4_id,
  uint32_t flags);
1101 
/// Define a Reshape Node with static shape specification and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param num_dims - number of shape dimensions in the output tensor.
/// @param new_shape - shape dimensions of the output tensor.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape specified in @a new_shape.
/// @param flags - binary features of the Reshape Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_reshape(
  xnn_subgraph_t subgraph,
  size_t num_dims,
  const size_t* new_shape,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1118 
/// Define a 2D Resize Bilinear Node with static output height & width specification and add it to a Subgraph.
///
/// The Node resamples an [N, H, W, C] input to [N, new_height, new_width, C] using bilinear interpolation.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param new_height - height dimension of the output tensor.
/// @param new_width - width dimension of the output tensor.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, H, W, C] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, new_height, new_width, C] dimensions.
/// @param flags - binary features of the 2D Resize Bilinear Node. The only currently supported values are
///                XNN_FLAG_TENSORFLOW_LEGACY_MODE and XNN_FLAG_ALIGN_CORNERS, which are mutually exclusive.
enum xnn_status xnn_define_static_resize_bilinear_2d(
  xnn_subgraph_t subgraph,
  size_t new_height,
  size_t new_width,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1137 
/// Define a PReLU (Parametric ReLU) Node and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be a 4D tensor defined in the @a subgraph
///                   with [N, H, W, channels] dimensions.
/// @param slope_id - Value ID for the slope tensor. The slope tensor must be a 1D tensor defined in the @a subgraph
///                   with [channels] dimensions.
/// @param output_id - Value ID for the output tensor. The output tensor must be a 4D tensor defined in the @a subgraph
///                    with [N, H, W, channels] dimensions.
/// @param flags - binary features of the PReLU Node. No supported flags are currently defined.
enum xnn_status xnn_define_prelu(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t slope_id,
  uint32_t output_id,
  uint32_t flags);
1154 
/// Define an Abs Node and add it to a Subgraph.
///
/// The Abs Node computes the element-wise absolute value of the input tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Abs Node. No supported flags are currently defined.
enum xnn_status xnn_define_abs(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1167 
/// Define a Bankers' Rounding Node and add it to a Subgraph.
///
/// Bankers' Rounding rounds each element to the nearest integer, with ties rounded to the nearest even integer.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined.
enum xnn_status xnn_define_bankers_rounding(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1180 
/// Define a Ceiling Node and add it to a Subgraph.
///
/// The Ceiling Node computes the element-wise ceiling (smallest integer not less than the value) of the input tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Ceiling Node. No supported flags are currently defined.
enum xnn_status xnn_define_ceiling(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1193 
/// Define a Clamp Node and add it to a Subgraph.
///
/// The Clamp Node clips each element of the input tensor to the [output_min, output_max] range.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param output_min - lower bound for clipping output values.
/// @param output_max - upper bound for clipping output values.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Clamp Node. No supported flags are currently defined.
enum xnn_status xnn_define_clamp(
  xnn_subgraph_t subgraph,
  float output_min,
  float output_max,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1210 
/// Define an ELU (Exponential Linear Unit) Node and add it to a Subgraph.
///
/// The ELU Node applies the Exponential Linear Unit function element-wise; negative output elements are scaled by
/// @a alpha.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param alpha - scale factor for negative output elements.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the ELU Node. No supported flags are currently defined.
enum xnn_status xnn_define_elu(
  xnn_subgraph_t subgraph,
  float alpha,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1225 
/// Define a Floor Node and add it to a Subgraph.
///
/// The Floor Node computes the element-wise floor (largest integer not greater than the value) of the input tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Floor Node. No supported flags are currently defined.
enum xnn_status xnn_define_floor(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1238 
/// Define a HardSwish Node and add it to a Subgraph.
///
/// The HardSwish Node applies the HardSwish activation function element-wise.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the HardSwish Node. No supported flags are currently defined.
enum xnn_status xnn_define_hardswish(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1251 
/// Define a Leaky ReLU Node and add it to a Subgraph.
///
/// The Leaky ReLU Node passes positive input elements through unchanged and multiplies negative input elements by
/// @a negative_slope.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param negative_slope - scale factor for negative input elements.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined.
enum xnn_status xnn_define_leaky_relu(
  xnn_subgraph_t subgraph,
  float negative_slope,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1266 
/// Define a Negate Node and add it to a Subgraph.
///
/// The Negate Node computes the element-wise negation (sign flip) of the input tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Negate Node. No supported flags are currently defined.
enum xnn_status xnn_define_negate(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1279 
/// Define a Sigmoid Node and add it to a Subgraph.
///
/// The Sigmoid Node applies the sigmoid (logistic) function element-wise.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined.
enum xnn_status xnn_define_sigmoid(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1292 
/// Define a SoftMax Node and add it to a Subgraph.
///
/// The SoftMax Node applies the SoftMax normalization to the input tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph, and have at
///                   least one dimension.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the SoftMax Node. No supported flags are currently defined.
enum xnn_status xnn_define_softmax(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1306 
/// Define a Square Node and add it to a Subgraph.
///
/// The Square Node computes the element-wise square of the input tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Square Node. No supported flags are currently defined.
enum xnn_status xnn_define_square(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1319 
/// Define a Square Root Node and add it to a Subgraph.
///
/// The Square Root Node computes the element-wise square root of the input tensor.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its
///                    shape must match the shape of the input tensor.
/// @param flags - binary features of the Square Root Node. No supported flags are currently defined.
enum xnn_status xnn_define_square_root(
  xnn_subgraph_t subgraph,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1332 
/// Define a Static Transpose Node and add it to a Subgraph.
///
/// The Static Transpose Node applies a generalized transpose to the input tensor using the permutation in perm.
///
/// @param subgraph - a Subgraph object that will own the created Node.
/// @param input_id - Value ID for the input tensor. The input tensor must be an N-dimensional tensor defined in
///                   the @a subgraph.
/// @param output_id - Value ID for the output tensor. The output tensor must be an N-dimensional tensor defined
///                    in the @a subgraph with each dimension equal to its corresponding permuted input dimension.
/// @param num_dims - the number of permutation dimensions. This must be equal to the number of input dimensions.
/// @param perm - The permutation of the axis of the input tensor. The perm array must contain 0 to N-1 in the
///               permuted order.
/// @param flags - binary features of the Static Transpose Node. No supported flags are currently defined.
enum xnn_status xnn_define_static_transpose(
  xnn_subgraph_t subgraph,
  size_t num_dims,
  const size_t* perm,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);
1353 
/// Weights cache is a cache for packed weights. It can be reused between runtimes.
typedef struct xnn_weights_cache* xnn_weights_cache_t;

/// Create a weights cache object.
/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache object
///                            upon successful return. Once created, the weights cache object can be shared between
///                            different Runtime objects.
enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out);
1358 
/// Create a weights cache object specifying the initial size of weights cache (in bytes).
/// @param size - initial capacity of the weights cache (in bytes), i.e. it can hold size bytes without growing.
/// @param weights_cache_out - pointer to the variable that will be initialized to a handle to the weights cache object
///                            upon successful return. Once created, the weights cache object can be shared between
///                            different Runtime objects.
enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out);
1365 
1366 
/// Weights cache can be finalized in these ways:
enum xnn_weights_cache_finalization_kind {
  /// Weights cache is finalized, no insert operations into the weights cache are allowed, even if the "inserted"
  /// weights already exist in the cache. Weights cache memory will also be trimmed to page boundary and set to
  /// read-only (to prevent writes).
  xnn_weights_cache_finalization_kind_hard,
  /// Weights cache will be finalized with some extra space at the end, this allows for "inserting" into the cache only
  /// if the weights are already in the cache, and errors on inserting uncached weights. There is memory overhead.
  xnn_weights_cache_finalization_kind_soft,
};
1377 
/// Finalizes the weights cache. The kind of finalization is specified by `finalization_kind`.
/// @param weights_cache - the weights cache object to finalize.
/// @param finalization_kind - the kind of finalization (see enum xnn_weights_cache_finalization_kind).
enum xnn_status xnn_finalize_weights_cache(
  xnn_weights_cache_t weights_cache,
  enum xnn_weights_cache_finalization_kind finalization_kind);

/// Destroy a weights cache object, as well as memory used for the cache.
/// @param weights_cache - the weights cache object to destroy.
enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache);
1388 
/// Workspace holds internal tensors for Runtime objects and can be shared between them.
typedef struct xnn_workspace* xnn_workspace_t;

/// Create a workspace object.
/// @param workspace_out - pointer to the variable that will be initialized to a handle to the workspace object upon
///                        successful return. Once created, the workspace can be shared between different Runtime
///                        objects.
enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out);

/// Destroy a workspace object, as well as memory used by the workspace. Object destruction can be deferred until all
/// Runtime objects created with this workspace are destroyed.
/// @param workspace - the workspace object to destroy.
enum xnn_status xnn_release_workspace(xnn_workspace_t workspace);
1400 
/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
typedef struct xnn_runtime* xnn_runtime_t;

/// Kinds of profiling information that can be queried via xnn_get_runtime_profiling_info.
enum xnn_profile_info {
  /// Returns a size_t containing the number of operators.
  xnn_profile_info_num_operators,
  /// Returns a char[] containing the null character separated names of all operators.
  xnn_profile_info_operator_name,
  /// Returns a uint64_t[] with the runtimes of all operators in the same order as xnn_profile_info_operator_name.
  xnn_profile_info_operator_timing,
};
1412 
/// Return profile information for all operators.
///
/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2,
///                  @ref xnn_create_runtime_v3 or @ref xnn_create_runtime_v4.
/// @param param_name - type of profile information required.
/// @param param_value_size - the size in bytes of memory pointed to by param_value. If this is not sufficient then
///                           param_value_size_ret will be set to the required size and xnn_status_out_of_memory will be
///                           returned.
/// @param param_value - a pointer to memory location where appropriate values for a given param_value will be written.
/// @param param_value_size_ret - returns number of bytes required to write the result if param_value_size is not
///                               sufficient.
enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
                                               enum xnn_profile_info param_name,
                                               size_t param_value_size,
                                               void* param_value,
                                               size_t* param_value_size_ret);
1429 
/// Create a Runtime object from a subgraph.
///
/// @param subgraph - a Subgraph object with all Values and Nodes that would be handled by the runtime. No Values or
///                   Nodes can be added to the runtime once it is constructed.
/// @param weights_cache - a cache for packed weights. The runtime will look up and reuse packed weights in this cache,
///                        this will reduce memory allocated for packed weights.
/// @param workspace - a workspace to hold internal tensors. The runtime will allocate space used for internal tensors
///                    and track them using workspace. Workspace can be shared and reused across different runtimes. If
///                    workspace is NULL, there will be no sharing: each runtime has its own workspace.
/// @param threadpool - the thread pool to be used for parallelization of computations in the runtime. If the thread
///                     pool is NULL, the computation would run on the caller thread without parallelization.
/// @param flags - binary features of the runtime. The only currently supported values are
///                XNN_FLAG_HINT_SPARSE_INFERENCE, XNN_FLAG_HINT_FP16_INFERENCE, XNN_FLAG_FORCE_FP16_INFERENCE, and
///                XNN_FLAG_YIELD_WORKERS. If XNN_FLAG_YIELD_WORKERS is specified, worker threads would be yielded to
///                the system scheduler after processing the last operator in the Runtime.
/// @param runtime_out - pointer to the variable that will be initialized with a handle to the Runtime object upon
///                      successful return. Once constructed, the Runtime object is independent of the Subgraph object
///                      used to create it.
enum xnn_status xnn_create_runtime_v4(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  xnn_workspace_t workspace,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1455 
/// Create a Runtime object from a subgraph, using a weights cache but no shared workspace.
/// See @ref xnn_create_runtime_v4 for the semantics of the common parameters.
enum xnn_status xnn_create_runtime_v3(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1462 
/// Create a Runtime object from a subgraph, without a weights cache or shared workspace.
/// See @ref xnn_create_runtime_v4 for the semantics of the common parameters.
enum xnn_status xnn_create_runtime_v2(
  xnn_subgraph_t subgraph,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out);
1468 
/// Create a Runtime object from a subgraph. See @ref xnn_create_runtime_v4 for the full-featured variant.
enum xnn_status xnn_create_runtime(
  xnn_subgraph_t subgraph,
  xnn_runtime_t* runtime_out);
1472 
/// Binding of an external Value ID to the memory backing it (see xnn_setup_runtime).
struct xnn_external_value {
  uint32_t id;  // Value ID of an external input or output tensor in the runtime.
  void* data;   // Pointer to the tensor data for this Value.
};
1477 
/// Setup data pointers for external inputs and outputs in a Runtime object.
///
/// @param runtime - a Runtime object created with @ref xnn_create_runtime, @ref xnn_create_runtime_v2,
///                  @ref xnn_create_runtime_v3 or @ref xnn_create_runtime_v4.
/// @param num_external_values - the number of external inputs and outputs specified in this call. This number must
///                              match the number of external inputs and outputs in the runtime, i.e. all external
///                              inputs and outputs in the runtime must be specified in one call.
/// @param external_values - array with location information for all external inputs and outputs in the runtime.
enum xnn_status xnn_setup_runtime(
  xnn_runtime_t runtime,
  size_t num_external_values,
  const struct xnn_external_value* external_values);
1489 
/// Execute forward pass for all operators in the runtime.
///
/// NOTE(review): external input/output pointers are presumably bound via @ref xnn_setup_runtime before
/// invocation - confirm with the implementation.
///
/// @param runtime - the Runtime object with the execution plan to invoke.
enum xnn_status xnn_invoke_runtime(
  xnn_runtime_t runtime);

/// Destroy a Runtime object, as well as operators and memory associated with it.
///
/// @param runtime - the Runtime object to destroy.
enum xnn_status xnn_delete_runtime(
  xnn_runtime_t runtime);
1501 
/// Operator is an opaque handle to a single XNNPACK compute operator.
typedef struct xnn_operator* xnn_operator_t;

/// Run a previously created and set up operator.
/// @param op - the operator object to run.
/// @param threadpool - the thread pool used to parallelize the computation.
///                     NOTE(review): presumably a NULL thread pool runs on the calling thread, as for Runtime
///                     objects - confirm.
enum xnn_status xnn_run_operator(
  xnn_operator_t op,
  pthreadpool_t threadpool);

/// Destroy an operator object, as well as memory associated with it.
/// @param op - the operator object to destroy.
enum xnn_status xnn_delete_operator(
  xnn_operator_t op);
1510 
1511 #ifndef XNN_NO_F32_OPERATORS
1512 
/// Create an Abs (absolute value) operator for F32 data in NC layout.
///
/// @param channels - number of channels per batch element.
/// @param input_stride - stride between consecutive batch elements of the input.
/// @param output_stride - stride between consecutive batch elements of the output.
/// @param flags - binary features of the operator.
/// @param abs_op_out - pointer to the variable that will hold the operator handle upon successful return.
enum xnn_status xnn_create_abs_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* abs_op_out);

/// Set up an Abs operator with data pointers and batch size; execute it with xnn_run_operator.
enum xnn_status xnn_setup_abs_nc_f32(
  xnn_operator_t abs_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1526 
/// Create an N-dimensional Add operator for F32 data, with outputs clamped to [output_min, output_max].
enum xnn_status xnn_create_add_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an N-dimensional Add operator with the shapes and data pointers of both inputs.
/// NOTE(review): the separately specified input shapes suggest broadcasting semantics - confirm.
enum xnn_status xnn_setup_add_nd_f32(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);
1543 
/// Create a 2D ArgMax Pooling operator for F32 data in NHWC layout.
enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* argmax_pooling_op_out);

/// Set up a 2D ArgMax Pooling operator; produces both pooled values (output) and indices (index).
/// NOTE(review): index presumably identifies the position of the maximum within each pooling window - confirm.
enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32(
  xnn_operator_t argmax_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  uint32_t* index,
  pthreadpool_t threadpool);
1566 
/// Create a 2D Average Pooling operator for F32 data in NHWC layout, with outputs clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_average_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Set up a 2D Average Pooling operator with data pointers and input dimensions.
enum xnn_status xnn_setup_average_pooling2d_nhwc_f32(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1592 
/// Create a Bankers' Rounding (round to nearest, ties to even) operator for F32 data in NC layout.
enum xnn_status xnn_create_bankers_rounding_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* rounding_op_out);

/// Set up a Bankers' Rounding operator with data pointers and batch size.
enum xnn_status xnn_setup_bankers_rounding_nc_f32(
  xnn_operator_t rounding_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1606 
/// Create a Ceiling (element-wise ceiling) operator for F32 data in NC layout.
enum xnn_status xnn_create_ceiling_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* ceiling_op_out);

/// Set up a Ceiling operator with data pointers and batch size.
enum xnn_status xnn_setup_ceiling_nc_f32(
  xnn_operator_t ceiling_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1620 
/// Create a Clamp operator for F32 data in NC layout; each element is clipped to [output_min, output_max].
enum xnn_status xnn_create_clamp_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Set up a Clamp operator with data pointers and batch size.
enum xnn_status xnn_setup_clamp_nc_f32(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1636 
1637 typedef const struct xnn_caches* xnn_caches_t;
1638 
/// Create a 2D Convolution operator for NHWC-layout F32 tensors.
/// Geometry is given by padding, kernel size, subsampling (stride), dilation,
/// and grouping; kernel/bias supply the weights (bias use when NULL is
/// implementation-defined — not visible here). caches may carry shared cached
/// state (see xnn_caches_t). On success, writes the operator handle to
/// *convolution_op_out.
enum xnn_status xnn_create_convolution2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

// Forward declare.
struct xnn_post_operation;

/// Create a convolution operator with a number of post operations. The
/// convolution operator created using this function does not have output_min
/// and output_max. The list of operators in post_operations will be applied in
/// order. Convolution with post operations is only supported on JIT platforms
/// and when JIT is enabled.
enum xnn_status xnn_create_fused_convolution2d_nhwc_f32(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_channel_stride,
    size_t output_channel_stride,
    const float* kernel,
    const float* bias,
    size_t num_post_operations,
    struct xnn_post_operation* post_operations,
    uint32_t flags,
    xnn_caches_t caches,
    xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution (NHWC, F32) operator with concrete input
/// dimensions and input/output pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_f32(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1703 
/// Create a 2D Deconvolution (transposed convolution) operator for
/// NHWC-layout F32 tensors. Note that, unlike convolution, the padding
/// parameters here apply to the OUTPUT.
enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a 2D Deconvolution (NHWC, F32) operator. adjustment_height/width
/// are extra output-size adjustments — exact semantics defined by the
/// implementation, not visible here.
enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1738 
/// Create an N-dimensional Divide operator for F32 tensors, clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_divide_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* divide_op_out);

/// Set up a Divide (ND, F32) operator. The two inputs carry independent
/// shapes (up to XNN_MAX_TENSOR_DIMS dims each); presumably they are
/// broadcast together — TODO confirm against the implementation.
enum xnn_status xnn_setup_divide_nd_f32(
  xnn_operator_t divide_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an ELU activation operator for NC-layout F32 tensors; alpha is the
/// scale parameter applied on the negative side.
enum xnn_status xnn_create_elu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up an ELU (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_elu_nc_f32(
  xnn_operator_t elu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Floor (round-down) operator for NC-layout F32 tensors.
enum xnn_status xnn_create_floor_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* floor_op_out);

/// Set up a Floor (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_floor_nc_f32(
  xnn_operator_t floor_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1784 
/// Create a Fully Connected (dense) operator for NC-layout F32 tensors.
/// kernel/bias supply the weights; outputs are clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_fully_connected_nc_f32(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  const xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a Fully Connected (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_fully_connected_nc_f32(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator for NWC-layout F32 tensors
/// (pools over the whole W dimension).
enum xnn_status xnn_create_global_average_pooling_nwc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling (NWC, F32) operator with batch size,
/// width, and tensor pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a HardSwish activation operator for NC-layout F32 tensors.
enum xnn_status xnn_create_hardswish_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_hardswish_nc_f32(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator for NC-layout F32 tensors; negative_slope is
/// the scalar multiplier applied to negative inputs.
enum xnn_status xnn_create_leaky_relu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a Leaky ReLU (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_leaky_relu_nc_f32(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1850 
/// Create a 2D Max Pooling operator for NHWC-layout F32 tensors; unlike
/// average pooling, the window additionally supports dilation.
enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a 2D Max Pooling (NHWC, F32) operator with concrete input
/// dimensions and tensor pointers.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional element-wise Maximum operator for F32 tensors.
enum xnn_status xnn_create_maximum_nd_f32(
  uint32_t flags,
  xnn_operator_t* maximum_op_out);

/// Set up a Maximum (ND, F32) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_maximum_nd_f32(
  xnn_operator_t maximum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional element-wise Minimum operator for F32 tensors.
enum xnn_status xnn_create_minimum_nd_f32(
  uint32_t flags,
  xnn_operator_t* minimum_op_out);

/// Set up a Minimum (ND, F32) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_minimum_nd_f32(
  xnn_operator_t minimum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Multiply operator for F32 tensors, clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_multiply_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up a Multiply (ND, F32) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_multiply_nd_f32(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);
1925 
/// Create a Negate operator for NC-layout F32 tensors.
enum xnn_status xnn_create_negate_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* negate_op_out);

/// Set up a Negate (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_negate_nc_f32(
  xnn_operator_t negate_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a PReLU operator for NC-layout F32 tensors. Unlike Leaky ReLU,
/// negative_slope here is a pointer — presumably one slope per channel;
/// TODO confirm the expected array length against the implementation.
enum xnn_status xnn_create_prelu_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  const float* negative_slope,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* prelu_op_out);

/// Set up a PReLU (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_prelu_nc_f32(
  xnn_operator_t prelu_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
1955 
/// Create a 2D Bilinear Resize operator for NCHW-layout F32 tensors.
enum xnn_status xnn_create_resize_bilinear2d_nchw_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a Bilinear Resize (NCHW, F32) operator with input and output
/// spatial dimensions and tensor pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nchw_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator for NHWC-layout F32 tensors.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a Bilinear Resize (NHWC, F32) operator with input and output
/// spatial dimensions and tensor pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid activation operator for NC-layout F32 tensors.
enum xnn_status xnn_create_sigmoid_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a Sigmoid (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_sigmoid_nc_f32(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a SoftMax operator for NC-layout F32 tensors (normalizes across
/// the channel dimension).
enum xnn_status xnn_create_softmax_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a SoftMax (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_softmax_nc_f32(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
2019 
/// Create a Square (x*x) operator for NC-layout F32 tensors.
enum xnn_status xnn_create_square_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* square_op_out);

/// Set up a Square (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_square_nc_f32(
  xnn_operator_t square_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Square Root operator for NC-layout F32 tensors.
enum xnn_status xnn_create_square_root_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sqrt_op_out);

/// Set up a Square Root (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_square_root_nc_f32(
  xnn_operator_t sqrt_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Squared Difference ((a-b)^2) operator for F32 tensors.
enum xnn_status xnn_create_squared_difference_nd_f32(
  uint32_t flags,
  xnn_operator_t* squared_difference_op_out);

/// Set up a Squared Difference (ND, F32) operator with the two input shapes
/// and the input/output pointers.
enum xnn_status xnn_setup_squared_difference_nd_f32(
  xnn_operator_t squared_difference_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Subtract operator for F32 tensors, clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_subtract_nd_f32(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up a Subtract (ND, F32) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_subtract_nd_f32(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const float* input1,
  const float* input2,
  float* output,
  pthreadpool_t threadpool);

/// Create a Truncation (round-toward-zero) operator for NC-layout F32 tensors.
enum xnn_status xnn_create_truncation_nc_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* truncation_op_out);

/// Set up a Truncation (NC, F32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_truncation_nc_f32(
  xnn_operator_t truncation_op,
  size_t batch_size,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
2093 
2094 #ifndef XNN_NO_NCHW_OPERATORS
2095 
/// Create a 2D Convolution operator for NCHW-layout F32 tensors; parameters
/// mirror the NHWC variant above.
enum xnn_status xnn_create_convolution2d_nchw_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution (NCHW, F32) operator with concrete input
/// dimensions and tensor pointers.
enum xnn_status xnn_setup_convolution2d_nchw_f32(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator for NCW-layout F32 tensors.
enum xnn_status xnn_create_global_average_pooling_ncw_f32(
  size_t channels,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling (NCW, F32) operator with batch size,
/// width, and tensor pointers.
enum xnn_status xnn_setup_global_average_pooling_ncw_f32(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const float* input,
  float* output,
  pthreadpool_t threadpool);
2143 
2144 #endif  // XNN_NO_NCHW_OPERATORS
2145 
2146 #endif  // XNN_NO_F32_OPERATORS
2147 
2148 #ifndef XNN_NO_X32_OPERATORS
2149 
/// Create a Channel Shuffle operator for NC-layout tensors with 32-bit
/// elements (type-agnostic: data passed as void*).
enum xnn_status xnn_create_channel_shuffle_nc_x32(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);

/// Set up a Channel Shuffle (NC, X32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_channel_shuffle_nc_x32(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Constant Pad operator for 32-bit elements;
/// padding_value points to the 32-bit fill value.
enum xnn_status xnn_create_constant_pad_nd_x32(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad (ND, X32) operator: pre_padding/post_padding give
/// per-dimension pad counts before and after the data.
enum xnn_status xnn_setup_constant_pad_nd_x32(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Copy operator for NC-layout tensors with 32-bit elements.
enum xnn_status xnn_create_copy_nc_x32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy (NC, X32) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_copy_nc_x32(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2193 
/// Create a Depth-to-Space operator (NHWC in, NHWC out) for 32-bit elements;
/// block_size is the spatial expansion factor.
enum xnn_status xnn_create_depth_to_space_nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space (NHWC, X32) operator with concrete input
/// dimensions and tensor pointers.
enum xnn_status xnn_setup_depth_to_space_nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator that reads NCHW input and writes NHWC
/// output, for 32-bit elements.
enum xnn_status xnn_create_depth_to_space_nchw2nhwc_x32(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space (NCHW-to-NHWC, X32) operator with concrete input
/// dimensions and tensor pointers.
enum xnn_status xnn_setup_depth_to_space_nchw2nhwc_x32(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator (NHWC) for 32-bit elements; block_size is
/// the spatial reduction factor.
enum xnn_status xnn_create_space_to_depth_nhwc_x32(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Set up a Space-to-Depth (NHWC, X32) operator with concrete input
/// dimensions and tensor pointers.
enum xnn_status xnn_setup_space_to_depth_nhwc_x32(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2244 
/// Create an N-dimensional Transpose operator for 32-bit elements.
enum xnn_status xnn_create_transpose_nd_x32(
    uint32_t flags,
    xnn_operator_t* transpose_op_out);

/// Set up a Transpose (ND, X32) operator: output_perm gives the permutation
/// of the num_dims input dimensions.
enum xnn_status xnn_setup_transpose_nd_x32(
    xnn_operator_t transpose_op,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool);

/// One-shot Transpose (ND, X32): performs the transpose directly without a
/// separately created operator object.
enum xnn_status xnn_run_transpose_nd_x32(
    uint32_t flags,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool);

/// Create a 2D Unpooling operator (inverse of max pooling) for NHWC-layout
/// tensors with 32-bit elements.
enum xnn_status xnn_create_unpooling2d_nhwc_x32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* unpooling_op_out);

/// Set up an Unpooling (NHWC, X32) operator. index carries per-element
/// positions within each pooling window — presumably as produced by a
/// matching pooling pass; TODO confirm index encoding.
enum xnn_status xnn_setup_unpooling2d_nhwc_x32(
  xnn_operator_t unpooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  const uint32_t* index,
  void* output,
  pthreadpool_t threadpool);
2289 
2290 #endif  // XNN_NO_X32_OPERATORS
2291 
2292 #ifndef XNN_NO_F16_OPERATORS
2293 
/// Create an Absolute Value operator for NC-layout F16 tensors
/// (half-precision data is passed as void*).
enum xnn_status xnn_create_abs_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* abs_op_out);

/// Set up an Abs (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_abs_nc_f16(
  xnn_operator_t abs_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Add operator for F16 tensors, clamped to
/// [output_min, output_max] (bounds are specified in F32).
enum xnn_status xnn_create_add_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up an Add (ND, F16) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_add_nd_f16(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);
2324 
/// Create a 2D Average Pooling operator for NHWC-layout F16 tensors;
/// parameters mirror the F32 variant.
enum xnn_status xnn_create_average_pooling2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Set up an Average Pooling (NHWC, F16) operator with concrete input
/// dimensions and tensor pointers.
enum xnn_status xnn_setup_average_pooling2d_nhwc_f16(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Bankers' Rounding operator for NC-layout F16 tensors.
enum xnn_status xnn_create_bankers_rounding_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* rounding_op_out);

/// Set up a Bankers' Rounding (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_bankers_rounding_nc_f16(
  xnn_operator_t rounding_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Ceiling operator for NC-layout F16 tensors.
enum xnn_status xnn_create_ceiling_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* ceiling_op_out);

/// Set up a Ceiling (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_ceiling_nc_f16(
  xnn_operator_t ceiling_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Clamp operator for NC-layout F16 tensors; the clamping range is
/// specified in F32.
enum xnn_status xnn_create_clamp_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Set up a Clamp (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_clamp_nc_f16(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2394 
/// Create a 2D Convolution operator for NHWC-layout F16 tensors; parameters
/// mirror the F32 variant, with kernel/bias supplied as void* (half-precision).
enum xnn_status xnn_create_convolution2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a 2D Convolution (NHWC, F16) operator with concrete input
/// dimensions and tensor pointers.
enum xnn_status xnn_setup_convolution2d_nhwc_f16(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution operator for NHWC-layout F16 tensors; padding
/// applies to the OUTPUT, mirroring the F32 variant.
enum xnn_status xnn_create_deconvolution2d_nhwc_f16(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a 2D Deconvolution (NHWC, F16) operator with input dimensions,
/// output-size adjustments, and tensor pointers.
enum xnn_status xnn_setup_deconvolution2d_nhwc_f16(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2462 
/// Create an N-dimensional Divide operator for F16 tensors, clamped to
/// [output_min, output_max] (bounds in F32).
enum xnn_status xnn_create_divide_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* divide_op_out);

/// Set up a Divide (ND, F16) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_divide_nd_f16(
  xnn_operator_t divide_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an ELU activation operator for NC-layout F16 tensors; alpha (F32)
/// scales the negative side.
enum xnn_status xnn_create_elu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up an ELU (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_elu_nc_f16(
  xnn_operator_t elu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Floor operator for NC-layout F16 tensors.
enum xnn_status xnn_create_floor_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* floor_op_out);

/// Set up a Floor (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_floor_nc_f16(
  xnn_operator_t floor_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator for NC-layout F16 tensors; kernel/bias
/// are supplied as void* (half-precision).
enum xnn_status xnn_create_fully_connected_nc_f16(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a Fully Connected (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_fully_connected_nc_f16(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2528 
/// Create a Global Average Pooling operator for NWC-layout F16 tensors.
enum xnn_status xnn_create_global_average_pooling_nwc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a Global Average Pooling (NWC, F16) operator with batch size,
/// width, and tensor pointers.
enum xnn_status xnn_setup_global_average_pooling_nwc_f16(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a HardSwish activation operator for NC-layout F16 tensors.
enum xnn_status xnn_create_hardswish_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* hardswish_op_out);

/// Set up a HardSwish (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_hardswish_nc_f16(
  xnn_operator_t hardswish_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator for NC-layout F16 tensors; negative_slope is
/// a scalar specified in F32.
enum xnn_status xnn_create_leaky_relu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a Leaky ReLU (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_leaky_relu_nc_f16(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator for NHWC-layout F16 tensors; parameters
/// mirror the F32 variant.
enum xnn_status xnn_create_max_pooling2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a Max Pooling (NHWC, F16) operator with concrete input dimensions
/// and tensor pointers.
enum xnn_status xnn_setup_max_pooling2d_nhwc_f16(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2602 
/// Create an N-dimensional element-wise Maximum operator for F16 tensors.
enum xnn_status xnn_create_maximum_nd_f16(
  uint32_t flags,
  xnn_operator_t* maximum_op_out);

/// Set up a Maximum (ND, F16) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_maximum_nd_f16(
  xnn_operator_t maximum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional element-wise Minimum operator for F16 tensors.
enum xnn_status xnn_create_minimum_nd_f16(
  uint32_t flags,
  xnn_operator_t* minimum_op_out);

/// Set up a Minimum (ND, F16) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_minimum_nd_f16(
  xnn_operator_t minimum_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Multiply operator for F16 tensors, clamped to
/// [output_min, output_max] (bounds in F32).
enum xnn_status xnn_create_multiply_nd_f16(
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up a Multiply (ND, F16) operator with the two input shapes and the
/// input/output pointers.
enum xnn_status xnn_setup_multiply_nd_f16(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);

/// Create a Negate operator for NC-layout F16 tensors.
enum xnn_status xnn_create_negate_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* negate_op_out);

/// Set up a Negate (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_negate_nc_f16(
  xnn_operator_t negate_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a PReLU operator for NC-layout F16 tensors; negative_slope points
/// to half-precision slope data — presumably one per channel, matching the
/// F32 variant; TODO confirm.
enum xnn_status xnn_create_prelu_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  const void* negative_slope,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* prelu_op_out);

/// Set up a PReLU (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_prelu_nc_f16(
  xnn_operator_t prelu_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a 2D Bilinear Resize operator for NHWC-layout F16 tensors.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_f16(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a Bilinear Resize (NHWC, F16) operator with input and output
/// spatial dimensions and tensor pointers.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f16(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);
2697 
/// Create a Sigmoid activation operator for NC-layout F16 tensors.
enum xnn_status xnn_create_sigmoid_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a Sigmoid (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_sigmoid_nc_f16(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a SoftMax operator for NC-layout F16 tensors.
enum xnn_status xnn_create_softmax_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a SoftMax (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_softmax_nc_f16(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Square operator for NC-layout F16 tensors.
enum xnn_status xnn_create_square_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* square_op_out);

/// Set up a Square (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_square_nc_f16(
  xnn_operator_t square_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Square Root operator for NC-layout F16 tensors.
enum xnn_status xnn_create_square_root_nc_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* sqrt_op_out);

/// Set up a Square Root (NC, F16) operator with a batch size and tensor pointers.
enum xnn_status xnn_setup_square_root_nc_f16(
  xnn_operator_t sqrt_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Squared Difference operator for F16 tensors.
enum xnn_status xnn_create_squared_difference_nd_f16(
  uint32_t flags,
  xnn_operator_t* squared_difference_op_out);

/// Set up a Squared Difference (ND, F16) operator with the two input shapes
/// and the input/output pointers.
enum xnn_status xnn_setup_squared_difference_nd_f16(
  xnn_operator_t squared_difference_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const void* input1,
  const void* input2,
  void* output,
  pthreadpool_t threadpool);
2768 
2769 enum xnn_status xnn_create_subtract_nd_f16(
2770   float output_min,
2771   float output_max,
2772   uint32_t flags,
2773   xnn_operator_t* subtract_op_out);
2774 
2775 enum xnn_status xnn_setup_subtract_nd_f16(
2776   xnn_operator_t subtract_op,
2777   size_t num_input1_dims,
2778   const size_t* input1_shape,
2779   size_t num_input2_dims,
2780   const size_t* input2_shape,
2781   const void* input1,
2782   const void* input2,
2783   void* output,
2784   pthreadpool_t threadpool);
2785 
2786 enum xnn_status xnn_create_truncation_nc_f16(
2787   size_t channels,
2788   size_t input_stride,
2789   size_t output_stride,
2790   uint32_t flags,
2791   xnn_operator_t* truncation_op_out);
2792 
2793 enum xnn_status xnn_setup_truncation_nc_f16(
2794   xnn_operator_t truncation_op,
2795   size_t batch_size,
2796   const void* input,
2797   void* output,
2798   pthreadpool_t threadpool);
2799 
2800 #endif  // XNN_NO_F16_OPERATORS
2801 
2802 #ifndef XNN_NO_X16_OPERATORS
2803 
/// Create a Constant Pad operator for 16-bit elements (the _x16 suffix denotes
/// type-agnostic 16-bit data movement; padding_value points to the fill value).
enum xnn_status xnn_create_constant_pad_nd_x16(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad operator: per-dimension pre/post padding amounts are
/// given alongside the input shape (num_dims entries each).
enum xnn_status xnn_setup_constant_pad_nd_x16(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Copy operator for 16-bit elements in NC layout.
enum xnn_status xnn_create_copy_nc_x16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy operator with a batch of samples.
enum xnn_status xnn_setup_copy_nc_x16(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator for 16-bit NHWC data; block_size is the
/// spatial upscaling factor, and channel strides are specified for both sides.
enum xnn_status xnn_create_depth_to_space_nhwc_x16(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space operator with batch and input spatial dimensions
/// (output dimensions follow from block_size).
enum xnn_status xnn_setup_depth_to_space_nhwc_x16(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator for 16-bit NHWC data (inverse layout
/// transform of depth-to-space).
enum xnn_status xnn_create_space_to_depth_nhwc_x16(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Set up a Space-to-Depth operator with batch and input spatial dimensions.
enum xnn_status xnn_setup_space_to_depth_nhwc_x16(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for 16-bit elements; the
/// permutation is supplied at setup time.
enum xnn_status xnn_create_transpose_nd_x16(
    uint32_t flags,
    xnn_operator_t* transpose_op_out);

/// Set up a Transpose operator: input_shape and output_perm each have num_dims
/// entries; output_perm maps output dimensions to input dimensions.
enum xnn_status xnn_setup_transpose_nd_x16(
    xnn_operator_t transpose_op,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool);

/// One-shot Transpose for 16-bit elements: create/setup/run/delete in a single
/// call, without a persistent operator object.
enum xnn_status xnn_run_transpose_nd_x16(
    uint32_t flags,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool);
2888 
2889 #endif  // XNN_NO_X16_OPERATORS
2890 
2891 #ifndef XNN_NO_QC8_OPERATORS
2892 
/// Create a 2D Convolution operator for channelwise-quantized 8-bit data (QC8)
/// in NHWC layout. Unlike the QS8 variant, kernel_scale is a pointer to
/// per-output-channel scales rather than a single per-tensor scale. Bias is
/// int32; output is requantized to int8 with output_zero_point/output_scale and
/// clamped to [output_min, output_max]. Padding/kernel/subsampling(stride)/
/// dilation are given per spatial dimension; groups with group_input_channels/
/// group_output_channels describe grouped (including depthwise) convolution.
/// caches may hold packed weights shared across operators.
enum xnn_status xnn_create_convolution2d_nhwc_qc8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  const float* kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a QC8 Convolution operator with batch and input spatial dimensions
/// (output dimensions follow from the creation-time geometry).
enum xnn_status xnn_setup_convolution2d_nhwc_qc8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
2930 
2931 #endif  // XNN_NO_QC8_OPERATORS
2932 
2933 #ifndef XNN_NO_QS8_OPERATORS
2934 
/// Create an Add operator for signed 8-bit quantized data (QS8) with
/// N-dimensional shapes bound at setup time. Each input and the output carries
/// its own zero point and scale; the requantized output is clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_add_nd_qs8(
  int8_t input1_zero_point,
  float input1_scale,
  int8_t input2_zero_point,
  float input2_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up a QS8 Add operator: bind the two input shapes (independent dimension
/// counts, suggesting broadcast semantics -- confirm), data pointers, and
/// thread pool.
enum xnn_status xnn_setup_add_nd_qs8(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const int8_t* input1,
  const int8_t* input2,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator for QS8 data in NHWC layout. kernel_scale
/// is a single per-tensor scale (contrast with the QC8 variant's per-channel
/// scales). Bias is int32; output is requantized and clamped.
enum xnn_status xnn_create_convolution2d_nhwc_qs8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a QS8 Convolution operator with batch and input spatial dimensions.
enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator for QS8 data in
/// NHWC layout; note this takes output_padding_* and stride_* where the forward
/// convolution takes input_padding_* and subsampling_*.
enum xnn_status xnn_create_deconvolution2d_nhwc_qs8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a QS8 Deconvolution operator; adjustment_height/width additionally
/// tweak the output size (presumably to disambiguate stride rounding --
/// confirm).
enum xnn_status xnn_setup_deconvolution2d_nhwc_qs8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create an ELU operator for QS8 data in NC layout; alpha scales the negative
/// branch.
enum xnn_status xnn_create_elu_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float alpha,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* elu_op_out);

/// Set up a QS8 ELU operator with a batch of samples.
enum xnn_status xnn_setup_elu_nc_qs8(
  xnn_operator_t elu_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator for QS8 data; kernel is int8 with a single
/// per-tensor scale, bias is int32, output is requantized and clamped.
enum xnn_status xnn_create_fully_connected_nc_qs8(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a QS8 Fully Connected operator with a batch of samples.
enum xnn_status xnn_setup_fully_connected_nc_qs8(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator for QS8 data in NWC layout
/// (averaging over the width dimension, per the setup signature).
enum xnn_status xnn_create_global_average_pooling_nwc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a QS8 Global Average Pooling operator with batch size and the width
/// extent to pool over.
enum xnn_status xnn_setup_global_average_pooling_nwc_qs8(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Multiply operator for QS8 data with N-dimensional shapes bound at
/// setup time; per-tensor quantization parameters for both inputs and output.
enum xnn_status xnn_create_multiply_nd_qs8(
  int8_t input1_zero_point,
  float input1_scale,
  int8_t input2_zero_point,
  float input2_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up a QS8 Multiply operator; same shape-binding contract as
/// xnn_setup_add_nd_qs8 above.
enum xnn_status xnn_setup_multiply_nd_qs8(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const int8_t* input1,
  const int8_t* input2,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator for QS8 data in NC layout; negative_slope is a
/// float applied in real (dequantized) space, presumably -- confirm. No output
/// clamping bounds are taken.
enum xnn_status xnn_create_leaky_relu_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a QS8 Leaky ReLU operator with a batch of samples.
enum xnn_status xnn_setup_leaky_relu_nc_qs8(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator for QS8 data in NC layout.
enum xnn_status xnn_create_sigmoid_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a QS8 Sigmoid operator with a batch of samples.
enum xnn_status xnn_setup_sigmoid_nc_qs8(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a Subtract operator for QS8 data with N-dimensional shapes bound at
/// setup time.
enum xnn_status xnn_create_subtract_nd_qs8(
  int8_t input1_zero_point,
  float input1_scale,
  int8_t input2_zero_point,
  float input2_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up a QS8 Subtract operator; same shape-binding contract as
/// xnn_setup_add_nd_qs8 above.
enum xnn_status xnn_setup_subtract_nd_qs8(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const int8_t* input1,
  const int8_t* input2,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a TanH operator for QS8 data in NC layout.
enum xnn_status xnn_create_tanh_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  int8_t input_zero_point,
  float input_scale,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* tanh_op_out);

/// Set up a QS8 TanH operator with a batch of samples.
enum xnn_status xnn_setup_tanh_nc_qs8(
  xnn_operator_t tanh_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
3207 
3208 #endif  // XNN_NO_QS8_OPERATORS
3209 
3210 #ifndef XNN_NO_QU8_OPERATORS
3211 
/// Create an Add operator for unsigned 8-bit quantized data (QU8) with
/// N-dimensional shapes bound at setup time; each input and the output carries
/// its own zero point and scale, and the output is clamped to
/// [output_min, output_max].
enum xnn_status xnn_create_add_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* add_op_out);

/// Set up a QU8 Add operator: bind the two input shapes (independent dimension
/// counts, suggesting broadcast semantics -- confirm), data pointers, and
/// thread pool.
enum xnn_status xnn_setup_add_nd_qu8(
  xnn_operator_t add_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Average Pooling operator for QU8 data in NHWC layout; pooling
/// window, stride and input padding are fixed at creation.
enum xnn_status xnn_create_average_pooling2d_nhwc_qu8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* average_pooling_op_out);

/// Set up a QU8 Average Pooling operator with batch and input spatial
/// dimensions.
enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8(
  xnn_operator_t average_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Convolution operator for QU8 data in NHWC layout; the kernel is
/// asymmetrically quantized (kernel_zero_point + per-tensor kernel_scale),
/// unlike the QS8/QC8 variants which take no kernel zero point.
enum xnn_status xnn_create_convolution2d_nhwc_qu8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out);

/// Set up a QU8 Convolution operator with batch and input spatial dimensions.
enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
  xnn_operator_t convolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Deconvolution (transposed convolution) operator for QU8 data in
/// NHWC layout; takes output_padding_* and stride_* (cf. the forward
/// convolution's input_padding_*/subsampling_*).
enum xnn_status xnn_create_deconvolution2d_nhwc_qu8(
  uint32_t output_padding_top,
  uint32_t output_padding_right,
  uint32_t output_padding_bottom,
  uint32_t output_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* deconvolution_op_out);

/// Set up a QU8 Deconvolution operator; adjustment_height/width additionally
/// tweak the output size (presumably to disambiguate stride rounding --
/// confirm).
enum xnn_status xnn_setup_deconvolution2d_nhwc_qu8(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Fully Connected operator for QU8 data; kernel is uint8 with zero
/// point and per-tensor scale, bias is int32, output is requantized and
/// clamped.
enum xnn_status xnn_create_fully_connected_nc_qu8(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* fully_connected_op_out);

/// Set up a QU8 Fully Connected operator with a batch of samples.
enum xnn_status xnn_setup_fully_connected_nc_qu8(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Global Average Pooling operator for QU8 data in NWC layout
/// (averaging over the width dimension, per the setup signature).
enum xnn_status xnn_create_global_average_pooling_nwc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* global_average_pooling_op_out);

/// Set up a QU8 Global Average Pooling operator with batch size and the width
/// extent to pool over.
enum xnn_status xnn_setup_global_average_pooling_nwc_qu8(
  xnn_operator_t global_average_pooling_op,
  size_t batch_size,
  size_t width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Leaky ReLU operator for QU8 data in NC layout; no output clamping
/// bounds are taken.
enum xnn_status xnn_create_leaky_relu_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float negative_slope,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* leaky_relu_op_out);

/// Set up a QU8 Leaky ReLU operator with a batch of samples.
enum xnn_status xnn_setup_leaky_relu_nc_qu8(
  xnn_operator_t leaky_relu_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Multiply operator for QU8 data with N-dimensional shapes bound at
/// setup time.
enum xnn_status xnn_create_multiply_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* multiply_op_out);

/// Set up a QU8 Multiply operator; same shape-binding contract as
/// xnn_setup_add_nd_qu8 above.
enum xnn_status xnn_setup_multiply_nd_qu8(
  xnn_operator_t multiply_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Sigmoid operator for QU8 data in NC layout.
enum xnn_status xnn_create_sigmoid_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* sigmoid_op_out);

/// Set up a QU8 Sigmoid operator with a batch of samples.
enum xnn_status xnn_setup_sigmoid_nc_qu8(
  xnn_operator_t sigmoid_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a SoftMax operator for QU8 data in NC layout; note there is no input
/// zero point and no output min/max here -- the output quantization is fixed by
/// output_zero_point/output_scale alone.
enum xnn_status xnn_create_softmax_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint32_t flags,
  xnn_operator_t* softmax_op_out);

/// Set up a QU8 SoftMax operator with a batch of samples.
enum xnn_status xnn_setup_softmax_nc_qu8(
  xnn_operator_t softmax_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a Subtract operator for QU8 data with N-dimensional shapes bound at
/// setup time.
enum xnn_status xnn_create_subtract_nd_qu8(
  uint8_t input1_zero_point,
  float input1_scale,
  uint8_t input2_zero_point,
  float input2_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* subtract_op_out);

/// Set up a QU8 Subtract operator; same shape-binding contract as
/// xnn_setup_add_nd_qu8 above.
enum xnn_status xnn_setup_subtract_nd_qu8(
  xnn_operator_t subtract_op,
  size_t num_input1_dims,
  const size_t* input1_shape,
  size_t num_input2_dims,
  const size_t* input2_shape,
  const uint8_t* input1,
  const uint8_t* input2,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a TanH operator for QU8 data in NC layout.
enum xnn_status xnn_create_tanh_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* tanh_op_out);

/// Set up a QU8 TanH operator with a batch of samples.
enum xnn_status xnn_setup_tanh_nc_qu8(
  xnn_operator_t tanh_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3513 
3514 #endif  // XNN_NO_QU8_OPERATORS
3515 
3516 #ifndef XNN_NO_S8_OPERATORS
3517 
/// Create a Clamp operator for raw signed 8-bit data in NC layout, limiting
/// values to [output_min, output_max].
enum xnn_status xnn_create_clamp_nc_s8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Set up an S8 Clamp operator with a batch of samples.
enum xnn_status xnn_setup_clamp_nc_s8(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator for S8 data in NHWC layout; window,
/// stride, dilation, and padding are fixed at creation, and outputs are
/// clamped to [output_min, output_max].
enum xnn_status xnn_create_max_pooling2d_nhwc_s8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up an S8 Max Pooling operator with batch and input spatial dimensions.
enum xnn_status xnn_setup_max_pooling2d_nhwc_s8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);

/// Create a bilinear-resize operator for S8 tensors in NHWC layout.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_s8(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up an S8 bilinear-resize operator; both input and output spatial sizes
/// are specified per call.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_s8(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
3579 
3580 #endif  // XNN_NO_S8_OPERATORS
3581 
3582 #ifndef XNN_NO_U8_OPERATORS
3583 
/// Create a Clamp operator for raw unsigned 8-bit data in NC layout, limiting
/// values to [output_min, output_max].
enum xnn_status xnn_create_clamp_nc_u8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* clamp_op_out);

/// Set up a U8 Clamp operator with a batch of samples.
enum xnn_status xnn_setup_clamp_nc_u8(
  xnn_operator_t clamp_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a 2D Max Pooling operator for U8 data in NHWC layout; window,
/// stride, dilation, and padding are fixed at creation, and outputs are
/// clamped to [output_min, output_max].
enum xnn_status xnn_create_max_pooling2d_nhwc_u8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out);

/// Set up a U8 Max Pooling operator with batch and input spatial dimensions.
enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);

/// Create a bilinear-resize operator for U8 tensors in NHWC layout.
enum xnn_status xnn_create_resize_bilinear2d_nhwc_u8(
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  xnn_operator_t* resize_op_out);

/// Set up a U8 bilinear-resize operator; both input and output spatial sizes
/// are specified per call.
enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8(
  xnn_operator_t resize_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3645 
3646 #endif  // XNN_NO_U8_OPERATORS
3647 
3648 #ifndef XNN_NO_X8_OPERATORS
3649 
/// Create a Copy operator for 8-bit elements in NC layout (the _x8 suffix
/// denotes type-agnostic 8-bit data movement).
enum xnn_status xnn_create_copy_nc_x8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* copy_op_out);

/// Set up a Copy operator with a batch of samples.
enum xnn_status xnn_setup_copy_nc_x8(
  xnn_operator_t copy_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Channel Shuffle operator for 8-bit elements in NC layout;
/// channels are organized as groups x group_channels (presumably interleaved
/// across groups on output -- confirm with the implementation).
enum xnn_status xnn_create_channel_shuffle_nc_x8(
  size_t groups,
  size_t group_channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* channel_shuffle_op_out);

/// Set up a Channel Shuffle operator with a batch of samples.
enum xnn_status xnn_setup_channel_shuffle_nc_x8(
  xnn_operator_t channel_shuffle_op,
  size_t batch_size,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Constant Pad operator for 8-bit elements; padding_value points to
/// the fill value.
enum xnn_status xnn_create_constant_pad_nd_x8(
  const void* padding_value,
  uint32_t flags,
  xnn_operator_t* constant_pad_op_out);

/// Set up a Constant Pad operator: per-dimension pre/post padding amounts are
/// given alongside the input shape (num_dims entries each).
enum xnn_status xnn_setup_constant_pad_nd_x8(
  xnn_operator_t constant_pad_op,
  size_t num_dims,
  const size_t* input_shape,
  const size_t* pre_padding,
  const size_t* post_padding,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Depth-to-Space operator for 8-bit NHWC data; block_size is the
/// spatial upscaling factor.
enum xnn_status xnn_create_depth_to_space_nhwc_x8(
  size_t output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* depth_to_space_op_out);

/// Set up a Depth-to-Space operator with batch and input spatial dimensions.
enum xnn_status xnn_setup_depth_to_space_nhwc_x8(
  xnn_operator_t depth_to_space_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create a Space-to-Depth operator for 8-bit NHWC data (inverse layout
/// transform of depth-to-space).
enum xnn_status xnn_create_space_to_depth_nhwc_x8(
  size_t input_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint32_t block_size,
  uint32_t flags,
  xnn_operator_t* space_to_depth_op_out);

/// Set up a Space-to-Depth operator with batch and input spatial dimensions.
enum xnn_status xnn_setup_space_to_depth_nhwc_x8(
  xnn_operator_t space_to_depth_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  pthreadpool_t threadpool);

/// Create an N-dimensional Transpose operator for 8-bit elements; the
/// permutation is supplied at setup time.
enum xnn_status xnn_create_transpose_nd_x8(
    uint32_t flags,
    xnn_operator_t* transpose_op_out);

/// Set up a Transpose operator: input_shape and output_perm each have num_dims
/// entries; output_perm maps output dimensions to input dimensions.
enum xnn_status xnn_setup_transpose_nd_x8(
    xnn_operator_t transpose_op,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool);

/// One-shot Transpose for 8-bit elements: create/setup/run/delete in a single
/// call, without a persistent operator object.
enum xnn_status xnn_run_transpose_nd_x8(
    uint32_t flags,
    const void* input,
    void* output,
    const size_t num_dims,
    const size_t* input_shape,
    const size_t* output_perm,
    pthreadpool_t threadpool);
3749 
3750 #endif  // XNN_NO_X8_OPERATORS
3751 
3752 #ifndef XNN_NO_CVT_OPERATORS
3753 
3754 enum xnn_status xnn_create_convert_nc_f16_f32(
3755   size_t channels,
3756   size_t input_stride,
3757   size_t output_stride,
3758   uint32_t flags,
3759   xnn_operator_t* convert_op_out);
3760 
3761 enum xnn_status xnn_setup_convert_nc_f16_f32(
3762   xnn_operator_t convert_op,
3763   size_t batch_size,
3764   const void* input,
3765   float* output,
3766   pthreadpool_t threadpool);
3767 
/// Create a Convert operator that converts FP32 input to FP16 output, in NC layout.
///
/// @param channels - number of channels (elements per batch item).
/// @param input_stride - stride, in elements, between consecutive batch items of the input.
/// @param output_stride - stride, in elements, between consecutive batch items of the output.
/// @param flags - binary features of the operator; none are visibly defined for this operator in this header chunk.
/// @param convert_op_out - pointer to the variable that receives the handle of the created operator.
enum xnn_status xnn_create_convert_nc_f32_f16(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Set up an FP32->FP16 Convert operator with a batch size and data pointers.
///
/// @param convert_op - operator handle previously created by xnn_create_convert_nc_f32_f16.
/// @param batch_size - number of batch items to convert.
/// @param input - pointer to the FP32 input data.
/// @param output - pointer to the FP16 output data (passed as void* since C has no standard FP16 type).
/// @param threadpool - thread pool used to parallelize the operation.
enum xnn_status xnn_setup_convert_nc_f32_f16(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  void* output,
  pthreadpool_t threadpool);
3781 
/// Create a Convert (quantize) operator that converts FP32 input to signed 8-bit quantized output, in NC layout.
///
/// @param channels - number of channels (elements per batch item).
/// @param input_stride - stride, in elements, between consecutive batch items of the input.
/// @param output_stride - stride, in elements, between consecutive batch items of the output.
/// @param output_scale - quantization scale of the output.
///   NOTE(review): presumably real_value = (quantized - zero_point) * scale — confirm quantization scheme.
/// @param output_zero_point - quantized value corresponding to real 0 in the output.
/// @param output_min - lower bound for clamping the quantized output.
/// @param output_max - upper bound for clamping the quantized output.
/// @param flags - binary features of the operator; none are visibly defined for this operator in this header chunk.
/// @param convert_op_out - pointer to the variable that receives the handle of the created operator.
enum xnn_status xnn_create_convert_nc_f32_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Set up an FP32->QS8 Convert operator with a batch size and data pointers.
///
/// @param convert_op - operator handle previously created by xnn_create_convert_nc_f32_qs8.
/// @param batch_size - number of batch items to convert.
/// @param input - pointer to the FP32 input data.
/// @param output - pointer to the signed 8-bit quantized output data.
/// @param threadpool - thread pool used to parallelize the operation.
enum xnn_status xnn_setup_convert_nc_f32_qs8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  int8_t* output,
  pthreadpool_t threadpool);
3799 
/// Create a Convert (quantize) operator that converts FP32 input to unsigned 8-bit quantized output, in NC layout.
///
/// @param channels - number of channels (elements per batch item).
/// @param input_stride - stride, in elements, between consecutive batch items of the input.
/// @param output_stride - stride, in elements, between consecutive batch items of the output.
/// @param output_scale - quantization scale of the output.
///   NOTE(review): presumably real_value = (quantized - zero_point) * scale — confirm quantization scheme.
/// @param output_zero_point - quantized value corresponding to real 0 in the output.
/// @param output_min - lower bound for clamping the quantized output.
/// @param output_max - upper bound for clamping the quantized output.
/// @param flags - binary features of the operator; none are visibly defined for this operator in this header chunk.
/// @param convert_op_out - pointer to the variable that receives the handle of the created operator.
enum xnn_status xnn_create_convert_nc_f32_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float output_scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Set up an FP32->QU8 Convert operator with a batch size and data pointers.
///
/// @param convert_op - operator handle previously created by xnn_create_convert_nc_f32_qu8.
/// @param batch_size - number of batch items to convert.
/// @param input - pointer to the FP32 input data.
/// @param output - pointer to the unsigned 8-bit quantized output data.
/// @param threadpool - thread pool used to parallelize the operation.
enum xnn_status xnn_setup_convert_nc_f32_qu8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const float* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3817 
/// Create a Convert (requantize) operator from signed 8-bit quantized input to signed 8-bit quantized output
/// with different quantization parameters, in NC layout.
///
/// @param channels - number of channels (elements per batch item).
/// @param input_stride - stride, in elements, between consecutive batch items of the input.
/// @param output_stride - stride, in elements, between consecutive batch items of the output.
/// @param input_scale - quantization scale of the input.
/// @param input_zero_point - quantized value corresponding to real 0 in the input.
/// @param output_scale - quantization scale of the output.
/// @param output_zero_point - quantized value corresponding to real 0 in the output.
/// @param flags - binary features of the operator; none are visibly defined for this operator in this header chunk.
/// @param convert_op_out - pointer to the variable that receives the handle of the created operator.
enum xnn_status xnn_create_convert_nc_qs8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  int8_t input_zero_point,
  float output_scale,
  int8_t output_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Set up a QS8->QS8 Convert operator with a batch size and data pointers.
///
/// @param convert_op - operator handle previously created by xnn_create_convert_nc_qs8.
/// @param batch_size - number of batch items to convert.
/// @param input - pointer to the signed 8-bit quantized input data.
/// @param output - pointer to the signed 8-bit quantized output data.
/// @param threadpool - thread pool used to parallelize the operation.
enum xnn_status xnn_setup_convert_nc_qs8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const int8_t* input,
  int8_t* output,
  pthreadpool_t threadpool);
3835 
/// Create a Convert (dequantize) operator that converts signed 8-bit quantized input to FP32 output, in NC layout.
///
/// @param channels - number of channels (elements per batch item).
/// @param input_stride - stride, in elements, between consecutive batch items of the input.
/// @param output_stride - stride, in elements, between consecutive batch items of the output.
/// @param input_scale - quantization scale of the input.
/// @param input_zero_point - quantized value corresponding to real 0 in the input.
/// @param flags - binary features of the operator; none are visibly defined for this operator in this header chunk.
/// @param convert_op_out - pointer to the variable that receives the handle of the created operator.
enum xnn_status xnn_create_convert_nc_qs8_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  int8_t input_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Set up a QS8->FP32 Convert operator with a batch size and data pointers.
///
/// @param convert_op - operator handle previously created by xnn_create_convert_nc_qs8_f32.
/// @param batch_size - number of batch items to convert.
/// @param input - pointer to the signed 8-bit quantized input data.
/// @param output - pointer to the FP32 output data.
/// @param threadpool - thread pool used to parallelize the operation.
enum xnn_status xnn_setup_convert_nc_qs8_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const int8_t* input,
  float* output,
  pthreadpool_t threadpool);
3851 
/// Create a Convert (requantize) operator from unsigned 8-bit quantized input to unsigned 8-bit quantized output
/// with different quantization parameters, in NC layout.
///
/// @param channels - number of channels (elements per batch item).
/// @param input_stride - stride, in elements, between consecutive batch items of the input.
/// @param output_stride - stride, in elements, between consecutive batch items of the output.
/// @param input_scale - quantization scale of the input.
/// @param input_zero_point - quantized value corresponding to real 0 in the input.
/// @param output_scale - quantization scale of the output.
/// @param output_zero_point - quantized value corresponding to real 0 in the output.
/// @param flags - binary features of the operator; none are visibly defined for this operator in this header chunk.
/// @param convert_op_out - pointer to the variable that receives the handle of the created operator.
enum xnn_status xnn_create_convert_nc_qu8(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t input_zero_point,
  float output_scale,
  uint8_t output_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Set up a QU8->QU8 Convert operator with a batch size and data pointers.
///
/// @param convert_op - operator handle previously created by xnn_create_convert_nc_qu8.
/// @param batch_size - number of batch items to convert.
/// @param input - pointer to the unsigned 8-bit quantized input data.
/// @param output - pointer to the unsigned 8-bit quantized output data.
/// @param threadpool - thread pool used to parallelize the operation.
enum xnn_status xnn_setup_convert_nc_qu8(
  xnn_operator_t convert_op,
  size_t batch_size,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool);
3869 
/// Create a Convert (dequantize) operator that converts unsigned 8-bit quantized input to FP32 output, in NC layout.
///
/// @param channels - number of channels (elements per batch item).
/// @param input_stride - stride, in elements, between consecutive batch items of the input.
/// @param output_stride - stride, in elements, between consecutive batch items of the output.
/// @param input_scale - quantization scale of the input.
/// @param input_zero_point - quantized value corresponding to real 0 in the input.
/// @param flags - binary features of the operator; none are visibly defined for this operator in this header chunk.
/// @param convert_op_out - pointer to the variable that receives the handle of the created operator.
enum xnn_status xnn_create_convert_nc_qu8_f32(
  size_t channels,
  size_t input_stride,
  size_t output_stride,
  float input_scale,
  uint8_t input_zero_point,
  uint32_t flags,
  xnn_operator_t* convert_op_out);

/// Set up a QU8->FP32 Convert operator with a batch size and data pointers.
///
/// @param convert_op - operator handle previously created by xnn_create_convert_nc_qu8_f32.
/// @param batch_size - number of batch items to convert.
/// @param input - pointer to the unsigned 8-bit quantized input data.
/// @param output - pointer to the FP32 output data.
/// @param threadpool - thread pool used to parallelize the operation.
enum xnn_status xnn_setup_convert_nc_qu8_f32(
  xnn_operator_t convert_op,
  size_t batch_size,
  const uint8_t* input,
  float* output,
  pthreadpool_t threadpool);
3885 
3886 #endif  // XNN_NO_CVT_OPERATORS
3887 
3888 #ifdef __cplusplus
3889 }  // extern "C"
3890 #endif
3891