1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <assert.h>
10 #include <math.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <string.h>
16 
17 #include <fp16.h>
18 
19 #include <xnnpack.h>
20 #include <xnnpack/allocator.h>
21 #include <xnnpack/cache.h>
22 #include <xnnpack/common.h>
23 #include <xnnpack/compute.h>
24 #include <xnnpack/indirection.h>
25 #include <xnnpack/log.h>
26 #include <xnnpack/math.h>
27 #include <xnnpack/operator.h>
28 #include <xnnpack/pack.h>
29 #include <xnnpack/params.h>
30 #include <xnnpack/post-operation.h>
31 #include <xnnpack/microparams-init.h>
32 
33 #ifndef XNN_ENABLE_GEMM_M_SPECIALIZATION
34 #error "XNN_ENABLE_GEMM_M_SPECIALIZATION is not defined"
35 #endif
36 
37 static inline size_t compute_output_dimension_with_tf_same_padding(
38     size_t input_dimension,
39     size_t subsampling_dimension)
40 {
41   return divide_round_up(input_dimension, subsampling_dimension);
42 }
43 
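// Linear scan over the table of dwconv microkernel parameters for an entry
// whose primary tile matches the convolution kernel size; returns NULL when
// no matching microkernel is available.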
44 static inline const struct dwconv_parameters* find_dwconv_ukernel(
45     size_t kernel_size,
46     const struct dwconv_parameters* ukernel,
47     size_t num_ukernels)
48 {
49   while (num_ukernels-- != 0) {
50     if (ukernel->primary_tile == kernel_size) {
51       return ukernel;
52     }
53     ukernel++;
54   }
55   return NULL;
56 }
57 
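// JIT support: the helpers below generate GEMM/IGEMM microkernels at operator
// creation time and store them in the operator's code cache.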
58 #if XNN_PLATFORM_JIT
59 static inline uintptr_t cached_code_at_offset(xnn_operator_t op, size_t offset)
60 {
61   return (uintptr_t)op->code_cache->cache.code.start + offset;
62 }
63 
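// Reserves space in the code cache, runs the GEMM code generator for the
// default microarchitecture, and returns the offset of the generated (or
// previously cached) microkernel; returns XNN_CACHE_NOT_FOUND on failure.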
64 static size_t get_generated_gemm(
65     struct xnn_hmp_gemm_codegen generators,
66     struct jit_gemm_params *jit_gemm_params,
67     size_t mr,
68     size_t group_output_channels,
69     size_t nr,
70     size_t group_input_channels,
71     size_t log2_input_element_size,
72     struct xnn_code_cache* code_cache)
73 {
74   size_t offset = XNN_CACHE_NOT_FOUND;
75   xnn_jit_gemm_code_generator_function generator = generators.function[XNN_UARCH_DEFAULT];
76   if (generator == NULL) {
77     goto error;
78   }
79 
80   enum xnn_status status = xnn_status_success;
81 
82   status = xnn_reserve_code_memory(&code_cache->cache.code, XNN_DEFAULT_MICROKERNEL_SIZE);
83   if (xnn_status_success != status) {
84     xnn_log_error("failed to ensure sufficient space in the code buffer for a microkernel");
85     goto error;
86   }
87 
88   const size_t old_size = code_cache->cache.code.size;
89   void* old_code = (uint8_t*) code_cache->cache.code.start + old_size;
90   status = generator(&code_cache->cache.code, mr, group_output_channels % nr,
91                      group_input_channels << log2_input_element_size,
92                      jit_gemm_params);
93 
94   if (xnn_status_success != status) {
95     xnn_log_error("failed to generate GEMM microkernel");
96     goto error;
97   }
98 
99   const size_t new_size = code_cache->cache.code.size;
100   return xnn_get_or_insert_code_cache(code_cache, old_code, new_size - old_size);
101 
102 error:
103   return offset;
104 }
105 
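// Generates GEMM microkernels for every MR from 1 up to max_mr (the single-row
// case uses the dedicated gemm1 generator) and records their code-cache offsets
// in the operator's gemm_cases table. No-op when the operator has no code cache.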
106 static void generate_gemms_up_to_max_mr(
107     size_t max_mr,
108     struct gemm_codegens generators,
109     struct jit_gemm_params *jit_gemm_params,
110     size_t group_output_channels,
111     size_t nr,
112     size_t group_input_channels,
113     size_t log2_input_element_size,
114     xnn_operator_t convolution_op)
115 {
116   assert(XNN_MAX_MR >= max_mr);
117   if (convolution_op->code_cache == NULL) {
118     return;
119   }
120   convolution_op->ukernel.gemm.gemm_cases[0].generated_code_offset[XNN_UARCH_DEFAULT] =
121       get_generated_gemm(generators.gemm1, jit_gemm_params, 1, group_output_channels, nr, group_input_channels,
122                          log2_input_element_size, convolution_op->code_cache);
123   for (size_t mr = 2; mr <= max_mr; mr++) {
124     convolution_op->ukernel.gemm.gemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT] =
125         get_generated_gemm(generators.gemm, jit_gemm_params, mr, group_output_channels, nr, group_input_channels,
126                            log2_input_element_size, convolution_op->code_cache);
127   }
128 }
129 
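// IGEMM counterpart of get_generated_gemm; the generator additionally receives
// the indirection buffer stride, kernel_size * mr * sizeof(void*).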
130 static size_t get_generated_igemm(
131     struct xnn_hmp_igemm_codegen generators,
132     struct jit_gemm_params *jit_gemm_params,
133     size_t group_output_channels,
134     size_t nr,
135     size_t group_input_channels,
136     size_t log2_input_element_size,
137     size_t kernel_size,
138     size_t mr,
139     struct xnn_code_cache* code_cache)
140 {
141   size_t offset = XNN_CACHE_NOT_FOUND;
142   xnn_jit_igemm_code_generator_function generator = generators.function[XNN_UARCH_DEFAULT];
143   if (generator == NULL) {
144     goto error;
145   }
146   enum xnn_status status = xnn_status_success;
147 
148   status = xnn_reserve_code_memory(&code_cache->cache.code, XNN_DEFAULT_MICROKERNEL_SIZE);
149   if (xnn_status_success != status) {
150     xnn_log_error("failed to ensure sufficient space in the code buffer for a microkernel");
151     goto error;
152   }
153 
154   const size_t old_size = code_cache->cache.code.size;
155   void* old_code = (uint8_t*) code_cache->cache.code.start + old_size;
156   status = generator(&code_cache->cache.code, mr, group_output_channels % nr,
157                      group_input_channels << log2_input_element_size,
158                      kernel_size * mr * sizeof(void*), jit_gemm_params);
159   if (status != xnn_status_success) {
160     xnn_log_error("failed to generate IGEMM microkernel");
161     goto error;
162   }
163 
164   const size_t new_size = code_cache->cache.code.size;
165   return xnn_get_or_insert_code_cache(code_cache, old_code, new_size - old_size);
166 
167 error:
168   return offset;
169 }
170 
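// Generates IGEMM microkernels for MR = 1..max_mr and records their code-cache
// offsets in the operator's igemm_cases table. No-op when the operator has no
// code cache.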
171 static void generate_igemms_up_to_max_mr(
172     size_t max_mr,
173     struct gemm_codegens generators,
174     struct jit_gemm_params *jit_gemm_params,
175     size_t group_output_channels,
176     size_t nr,
177     size_t group_input_channels,
178     size_t log2_input_element_size,
179     size_t kernel_size,
180     xnn_operator_t convolution_op)
181 {
182   assert(XNN_MAX_MR >= max_mr);
183   if (convolution_op->code_cache == NULL) {
184     return;
185   }
186   convolution_op->ukernel.igemm.igemm_cases[0].generated_code_offset[XNN_UARCH_DEFAULT] =
187       get_generated_igemm(generators.igemm1, jit_gemm_params, group_output_channels, nr, group_input_channels,
188                           log2_input_element_size, kernel_size, 1, convolution_op->code_cache);
189   for (size_t mr = 2; mr <= max_mr; mr++) {
190     convolution_op->ukernel.igemm.igemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT] =
191       get_generated_igemm(generators.igemm, jit_gemm_params, group_output_channels, nr, group_input_channels,
192                           log2_input_element_size, kernel_size, mr, convolution_op->code_cache);
193   }
194 }
195 #endif  // XNN_PLATFORM_JIT
196 
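// Shared implementation behind the per-datatype convolution creators below:
// validates parameters, selects the microkernel type (VMULCADDC, DWCONV, GEMM,
// or IGEMM), packs the weights, and initializes the operator descriptor.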
197 static enum xnn_status create_convolution2d_nhwc(
198     uint32_t input_padding_top,
199     uint32_t input_padding_right,
200     uint32_t input_padding_bottom,
201     uint32_t input_padding_left,
202     uint32_t kernel_height,
203     uint32_t kernel_width,
204     uint32_t subsampling_height,
205     uint32_t subsampling_width,
206     uint32_t dilation_height,
207     uint32_t dilation_width,
208     uint32_t groups,
209     size_t group_input_channels,
210     size_t group_output_channels,
211     size_t input_channel_stride,
212     size_t output_channel_stride,
213     const void* kernel,
214     const void* bias,
215     uint32_t flags,
216     uint32_t log2_input_element_size,
217     uint32_t log2_filter_element_size,
218     uint32_t bias_element_size,
219     xnn_pack_vmulcaddc_w_function pack_vmulcaddc_w,
220     xnn_pack_dwconv_hwg_w_function pack_dwconv_hwg_w,
221     xnn_pack_dwconv_ghw_w_function pack_dwconv_ghw_w,
222     xnn_pack_gemm_goi_w_function pack_gemm_goi_w,
223     xnn_pack_conv_kgo_w_function pack_conv_kgo_w,
224     xnn_pack_conv_goki_w_function pack_conv_goki_w,
225     const void* packing_params,
226     int input_padding_byte,
227     int packed_weights_padding_byte,
228     size_t extra_weights_bytes,
229     xnn_init_qc8_scale_params_fn init_scale_params,
230     const float* scale_params,
231     const void* gemm_params,
232     size_t gemm_params_size,
233     const void* dwconv_params,
234     size_t dwconv_params_size,
235     const void* vmulcaddc_params,
236     size_t vmulcaddc_params_size,
237     const struct gemm_parameters* gemm_parameters,
238     const struct dwconv_parameters* dwconv_ukernel,
239     const struct vmulcaddc_parameters* vmulcaddc_parameters,
240     struct jit_gemm_params* jit_gemm_params,
241     bool linear_activation,
242     bool relu_activation,
243     uint32_t datatype_init_flags,
244     enum xnn_operator_type operator_type,
245     size_t num_post_operations,
246     void* post_operation_params,
247     xnn_caches_t caches,
248     xnn_operator_t* convolution_op_out)
249 {
250   xnn_operator_t convolution_op = NULL;
251   enum xnn_status status = xnn_status_uninitialized;
252 
253   if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
254     xnn_log_error(
255       "failed to create %s operator: XNNPACK is not initialized",
256       xnn_operator_type_to_string(operator_type));
257     goto error;
258   }
259 
260   status = xnn_status_unsupported_hardware;
261 
262   if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
263     xnn_log_error(
264       "failed to create %s operator: operations on data type are not supported",
265       xnn_operator_type_to_string(operator_type));
266     goto error;
267   }
268 
269   status = xnn_status_invalid_parameter;
270 
271   if (kernel_width == 0 || kernel_height == 0) {
272     xnn_log_error(
273       "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
274       xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
275     goto error;
276   }
277 
278   if (subsampling_width == 0 || subsampling_height == 0) {
279     xnn_log_error(
280       "failed to create %s operator with %" PRIu32 "x%" PRIu32 " subsampling: subsampling dimensions must be non-zero",
281       xnn_operator_type_to_string(operator_type), subsampling_width, subsampling_height);
282     goto error;
283   }
284 
285   if (dilation_width == 0 || dilation_height == 0) {
286     xnn_log_error(
287       "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
288       xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
289     goto error;
290   }
291 
292   if (groups == 0) {
293     xnn_log_error(
294       "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
295       xnn_operator_type_to_string(operator_type), groups);
296     goto error;
297   }
298 
299   if (group_input_channels == 0) {
300     xnn_log_error(
301       "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
302       xnn_operator_type_to_string(operator_type), group_input_channels);
303     goto error;
304   }
305 
306   if (group_output_channels == 0) {
307     xnn_log_error(
308       "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
309       xnn_operator_type_to_string(operator_type), group_output_channels);
310     goto error;
311   }
312 
313   const size_t input_channels = groups * group_input_channels;
314   if (input_channel_stride < input_channels) {
315     xnn_log_error(
316       "failed to create %s operator with input channel stride of %zu: "
317       "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
318       xnn_operator_type_to_string(operator_type),
319       input_channel_stride, groups, group_input_channels);
320     goto error;
321   }
322 
323   const size_t output_channels = groups * group_output_channels;
324   if (output_channel_stride < output_channels) {
325     xnn_log_error(
326       "failed to create %s operator with output channel stride of %zu: "
327       "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
328       xnn_operator_type_to_string(operator_type),
329       output_channel_stride, groups, group_output_channels);
330     goto error;
331   }
332 
333   if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
334     xnn_log_error(
335       "failed to create depthwise %s operator with %zu input channels per group: "
336       "depthwise convolution must have exactly 1 input channel per group",
337       xnn_operator_type_to_string(operator_type), group_input_channels);
338     goto error;
339   }
340 
341   const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
342   if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
343     if (any_padding) {
344       xnn_log_error(
345         "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
346         "TensorFlow SAME padding can't be combined with explicit padding specification",
347         xnn_operator_type_to_string(operator_type),
348         input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
349       goto error;
350     }
351   }
352 
353   status = xnn_status_out_of_memory;
354 
355   convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
356   if (convolution_op == NULL) {
357     xnn_log_error(
358       "failed to allocate %zu bytes for %s operator descriptor",
359       sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
360     goto error;
361   }
362 
363   if (caches != NULL) {
364     convolution_op->weights_cache = caches->weights_cache;
365     convolution_op->code_cache = caches->code_cache;
366   }
367 
368   const size_t kernel_size = kernel_height * kernel_width;
369 
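  // Select the microkernel type: 1x1 unit-stride unpadded convolutions with a
  // single channel per group map to VMULCADDC, other single-channel-per-group
  // cases to DWCONV (when a matching microkernel exists), remaining 1x1
  // unit-stride unpadded cases to GEMM, and everything else to IGEMM.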
370   enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_default;
371   const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
372   if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding && vmulcaddc_parameters != NULL) {
373     ukernel_type = xnn_ukernel_type_vmulcaddc;
374   } else if (group_input_channels == 1 && group_output_channels == 1 && dwconv_ukernel != NULL)
375   {
376     ukernel_type = xnn_ukernel_type_dwconv;
377   } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
378     ukernel_type = xnn_ukernel_type_gemm;
379   } else {
380     ukernel_type = xnn_ukernel_type_igemm;
381   }
382   assert(ukernel_type != xnn_ukernel_type_default);
383 
384   if (num_post_operations != 0 && ukernel_type != xnn_ukernel_type_gemm) {
385     xnn_log_error(
386         "convolution with post operations not supported for these parameters: "
387         "kernel_size: %zu unit_subsampling: %d padding: %d",
388         kernel_size, unit_subsampling, any_padding);
389     goto error;
390   }
391 
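  // Size of the zero-padding buffer required by the selected microkernel; set
  // by the DWCONV and GEMM/IGEMM cases in the switch below and used when
  // allocating zero_buffer further down.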
392   size_t zero_size = 0;
393   switch (ukernel_type) {
394     case xnn_ukernel_type_vmulcaddc:
395     {
396       assert(vmulcaddc_parameters != NULL);
397       assert(vmulcaddc_params != NULL);
398 
399       const size_t c_stride = round_up_po2(groups, vmulcaddc_parameters->channel_tile);
400       const size_t packed_weights_size = ((UINT32_C(1) << log2_filter_element_size) + bias_element_size) * c_stride;
401       size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT);
402       void* weights_ptr = xnn_get_pointer_to_write_weights(
403           convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
404       if (weights_ptr == NULL) {
405         xnn_log_error("failed to reserve or allocate %zu bytes for %s operator vmulcaddc packed weights",
406                       aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
407         goto error;
408       }
409 
410       pack_vmulcaddc_w(
411         groups, vmulcaddc_parameters->channel_tile,
412         kernel, bias, weights_ptr, packing_params);
413 
414       if (use_weights_cache(convolution_op)) {
415         convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
416             convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
417       }
418 
419       memcpy(&convolution_op->params, vmulcaddc_params, vmulcaddc_params_size);
420 
421       convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
422         .function = vmulcaddc_parameters->ukernel,
423         .mr = vmulcaddc_parameters->row_tile,
424       };
425       break;
426     }
427     case xnn_ukernel_type_dwconv:
428     {
429       assert(dwconv_ukernel != NULL);
430       assert(dwconv_ukernel->primary_tile == kernel_size);
431 
432       const size_t c_stride = round_up_po2(groups, dwconv_ukernel->channel_tile);
433       const size_t packed_weights_size = ((kernel_size << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * c_stride;
434       size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT);
435       void* weights_ptr = xnn_get_pointer_to_write_weights(
436           convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
437       if (weights_ptr == NULL) {
438         xnn_log_error("failed to reserve or allocate %zu bytes for %s operator dwconv packed weights",
439                       aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
440         goto error;
441       }
442       memcpy(&convolution_op->params, dwconv_params, dwconv_params_size);
443 
444       if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
445         pack_dwconv_hwg_w(
446           dwconv_ukernel->primary_tile,
447           kernel_height, kernel_width,
448           groups, dwconv_ukernel->channel_tile,
449           kernel, bias, weights_ptr,
450           dwconv_ukernel->channel_tile * extra_weights_bytes,
451           packing_params);
452       } else {
453         pack_dwconv_ghw_w(
454           dwconv_ukernel->primary_tile,
455           kernel_height, kernel_width,
456           groups, dwconv_ukernel->channel_tile,
457           kernel, bias, weights_ptr,
458           dwconv_ukernel->channel_tile * extra_weights_bytes,
459           packing_params);
460       }
461 
462       if (scale_params != NULL) {
463         assert(init_scale_params != NULL);
464 
465         init_scale_params(
466           groups, dwconv_ukernel->channel_tile,
467           dwconv_ukernel->channel_tile * ((kernel_size << log2_filter_element_size) + bias_element_size + extra_weights_bytes),
468           scale_params,
469           (void*) ((uintptr_t) weights_ptr + dwconv_ukernel->channel_tile * ((kernel_size << log2_filter_element_size) + bias_element_size)));
470       }
471 
472       if (use_weights_cache(convolution_op)) {
473         convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
474             convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
475       }
476 
477       const union dwconv_fused_ukernels* ukernels = &dwconv_ukernel->minmax;
478       if (linear_activation && dwconv_ukernel->linear.unipass != NULL) {
479         ukernels = &dwconv_ukernel->linear;
480       }
481       convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
482         .unipass_function = ukernels->unipass,
483         .primary_tile = dwconv_ukernel->primary_tile,
484         .incremental_tile = dwconv_ukernel->incremental_tile,
485       };
486 
487       zero_size = XNN_EXTRA_BYTES + (c_stride << log2_input_element_size);
488       break;
489     }
490     case xnn_ukernel_type_gemm:
491     case xnn_ukernel_type_igemm:
492     {
493       const uint32_t nr = gemm_parameters->nr;
494       const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
495       const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;
496       const size_t n_stride = round_up(group_output_channels, nr);
497       const size_t k_stride = round_up_po2(group_input_channels, kr * sr);
498 
499       const size_t packed_group_weights_size = ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * n_stride;
500       const size_t aligned_total_weights_size = round_up_po2(packed_group_weights_size * groups, XNN_ALLOCATION_ALIGNMENT);
501       void* weights_ptr = xnn_get_pointer_to_write_weights(
502         convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
503       if (weights_ptr == NULL) {
504         xnn_log_error("failed to reserve or allocate %zu bytes for %s operator gemm packed weights",
505                       aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
506         goto error;
507       }
508       memcpy(&convolution_op->params, gemm_params, gemm_params_size);
509       convolution_op->num_post_operation_params = num_post_operations;
510       convolution_op->post_operation_params = post_operation_params;
511 
512       const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
513       const uint32_t mr = gemm_parameters->mr;
514       if (linear_activation && gemm_parameters->linear.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) {
515         gemm_ukernels = &gemm_parameters->linear;
516       } else if (relu_activation && gemm_parameters->relu.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) {
517         gemm_ukernels = &gemm_parameters->relu;
518       }
519       switch (ukernel_type) {
520         case xnn_ukernel_type_gemm:
521             pack_gemm_goi_w(
522                 groups, group_output_channels, group_input_channels,
523                 nr, kr, sr,
524                 kernel, bias, weights_ptr, gemm_parameters->nr * extra_weights_bytes, packing_params);
525           convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
526             .mr = mr,
527             .nr = nr,
528             .kr = kr,
529             .sr = sr,
530           };
531 
532           assert(XNN_MAX_MR >= mr);
533           for (size_t i = 0; i < mr; i++) {
534             convolution_op->ukernel.gemm.gemm_cases[i] = gemm_ukernels->gemm[i];
535           }
536 
537           #if XNN_PLATFORM_JIT
538             generate_gemms_up_to_max_mr(
539               mr, gemm_parameters->generator, jit_gemm_params, group_output_channels, nr,
540               group_input_channels, log2_input_element_size, convolution_op);
541           #endif  // XNN_PLATFORM_JIT
542 
543           break;
544         case xnn_ukernel_type_igemm:
545           if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
546             pack_conv_kgo_w(
547               groups, group_output_channels, kernel_size,
548               nr, kr, sr,
549               kernel, bias, weights_ptr, gemm_parameters->nr * extra_weights_bytes, packing_params);
550           } else {
551             pack_conv_goki_w(
552               groups, group_output_channels, kernel_size, group_input_channels,
553               nr, kr, sr,
554               kernel, bias, weights_ptr, gemm_parameters->nr * extra_weights_bytes, packing_params);
555           }
556           convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
557             .mr = mr,
558             .nr = nr,
559             .kr = kr,
560             .sr = sr,
561           };
562 
563           assert(XNN_MAX_MR >= mr);
564           for (size_t i = 0; i < mr; i++) {
565             convolution_op->ukernel.igemm.igemm_cases[i] = gemm_ukernels->igemm[i];
566           }
567 
568           #if XNN_PLATFORM_JIT
569             generate_igemms_up_to_max_mr(
570               mr, gemm_parameters->generator, jit_gemm_params, group_output_channels, nr,
571               group_input_channels, log2_input_element_size, kernel_size, convolution_op);
572           #endif  // XNN_PLATFORM_JIT
573 
574           break;
575         default:
576           XNN_UNREACHABLE;
577       }
578 
579       if (scale_params != NULL) {
580         assert(init_scale_params != NULL);
581 
582         void* group_weights = (void*)
583           ((uintptr_t) weights_ptr + gemm_parameters->nr * ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size));
584         const size_t weights_stride = (kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes;
585         for (uint32_t group = 0; group < groups; group++) {
586           init_scale_params(
587             group_output_channels, gemm_parameters->nr,
588             gemm_parameters->nr * weights_stride,
589             scale_params, group_weights);
590           scale_params += group_output_channels;
591           group_weights = (void*) ((uintptr_t) group_weights + n_stride * weights_stride);
592         }
593       }
594 
595       if (use_weights_cache(convolution_op)) {
596         convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
597             convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
598       }
599 
600       zero_size = XNN_EXTRA_BYTES + (k_stride << log2_input_element_size);
601       break;
602     }
603     default:
604       XNN_UNREACHABLE;
605   }
606 
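  // Convolutions with explicit or TF-SAME padding read padded pixels from this
  // zero buffer; it is filled with input_padding_byte, which the callers set to
  // the input zero point for quantized datatypes and to 0 otherwise.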
607   const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
608   if (any_padding || tf_same_padding) {
609     convolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
610     if (convolution_op->zero_buffer == NULL) {
611       xnn_log_error(
612         "failed to allocate %zu bytes for %s operator zero padding",
613         zero_size, xnn_operator_type_to_string(operator_type));
614       goto error;
615     }
616     memset(convolution_op->zero_buffer, input_padding_byte, zero_size);
617   }
618 
619   convolution_op->padding_top = input_padding_top;
620   convolution_op->padding_right = input_padding_right;
621   convolution_op->padding_bottom = input_padding_bottom;
622   convolution_op->padding_left = input_padding_left;
623 
624   convolution_op->kernel_height = kernel_height;
625   convolution_op->kernel_width = kernel_width;
626   convolution_op->stride_height = subsampling_height;
627   convolution_op->stride_width = subsampling_width;
628   convolution_op->dilation_height = dilation_height;
629   convolution_op->dilation_width = dilation_width;
630   convolution_op->groups = groups;
631   convolution_op->group_input_channels = group_input_channels;
632   convolution_op->group_output_channels = group_output_channels;
633   convolution_op->input_pixel_stride = input_channel_stride;
634   convolution_op->output_pixel_stride = output_channel_stride;
635 
636   convolution_op->type = operator_type;
637   convolution_op->ukernel.type = ukernel_type;
638   convolution_op->flags = flags & ~XNN_FLAG_TENSORFLOW_SAME_PADDING;
639   if (tf_same_padding) {
640     convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
641   }
642 
643   convolution_op->state = xnn_run_state_invalid;
644 
645   *convolution_op_out = convolution_op;
646   return xnn_status_success;
647 
648 error:
649   xnn_delete_operator(convolution_op);
650   return status;
651 }
652 
653 enum xnn_status xnn_create_convolution2d_nhwc_qu8(
654     uint32_t input_padding_top,
655     uint32_t input_padding_right,
656     uint32_t input_padding_bottom,
657     uint32_t input_padding_left,
658     uint32_t kernel_height,
659     uint32_t kernel_width,
660     uint32_t subsampling_height,
661     uint32_t subsampling_width,
662     uint32_t dilation_height,
663     uint32_t dilation_width,
664     uint32_t groups,
665     size_t group_input_channels,
666     size_t group_output_channels,
667     size_t input_channel_stride,
668     size_t output_channel_stride,
669     uint8_t input_zero_point,
670     float input_scale,
671     uint8_t kernel_zero_point,
672     float kernel_scale,
673     const uint8_t* kernel,
674     const int32_t* bias,
675     uint8_t output_zero_point,
676     float output_scale,
677     uint8_t output_min,
678     uint8_t output_max,
679     uint32_t flags,
680     xnn_caches_t caches,
681     xnn_operator_t* convolution_op_out)
682 {
683   if (input_scale <= 0.0f || !isnormal(input_scale)) {
684     xnn_log_error(
685       "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
686       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), input_scale);
687     return xnn_status_invalid_parameter;
688   }
689 
690   if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
691     xnn_log_error(
692       "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
693       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), kernel_scale);
694     return xnn_status_invalid_parameter;
695   }
696 
697   if (output_scale <= 0.0f || !isnormal(output_scale)) {
698     xnn_log_error(
699       "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
700       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_scale);
701     return xnn_status_invalid_parameter;
702   }
703 
704   if (output_min >= output_max) {
705     xnn_log_error(
706       "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
707       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_min, output_max);
708     return xnn_status_invalid_parameter;
709   }
710 
711   const float requantization_scale = input_scale * kernel_scale / output_scale;
712   if (requantization_scale >= 256.0f) {
713     xnn_log_error(
714       "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
715       "requantization scale %.7g is greater than or equal to 256.0",
716       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8),
717       input_scale, kernel_scale, output_scale, requantization_scale);
718     return xnn_status_unsupported_parameter;
719   }
720 
721   const struct xnn_qu8_packing_params packing_params = {
722     .input_zero_point = input_zero_point,
723     .kernel_zero_point = kernel_zero_point,
724   };
725 
726 
727   union xnn_qu8_conv_minmax_params gemm_params;
728   if XNN_LIKELY(xnn_params.qu8.gemm.init.qu8 != NULL) {
729     xnn_params.qu8.gemm.init.qu8(&gemm_params,
730       kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
731   }
732 
733   union xnn_qu8_conv_minmax_params dwconv_params;
734   const struct dwconv_parameters* dwconv_ukernel =
735     find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.qu8.dwconv, XNN_MAX_QU8_DWCONV_UKERNELS);
736   if XNN_LIKELY(dwconv_ukernel != NULL) {
737     dwconv_ukernel->init.qu8(&dwconv_params,
738       kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
739   }
740 
741   return create_convolution2d_nhwc(
742     input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
743     kernel_height, kernel_width,
744     subsampling_height, subsampling_width,
745     dilation_height, dilation_width,
746     groups, group_input_channels, group_output_channels,
747     input_channel_stride, output_channel_stride,
748     kernel, bias, flags,
749     0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
750     0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
751     sizeof(int32_t) /* sizeof(bias element) */,
752     (xnn_pack_vmulcaddc_w_function) NULL,
753     (xnn_pack_dwconv_hwg_w_function) xnn_pack_qu8_dwconv_hwg_w,
754     (xnn_pack_dwconv_ghw_w_function) xnn_pack_qu8_dwconv_ghw_w,
755     (xnn_pack_gemm_goi_w_function) xnn_pack_qu8_gemm_goi_w,
756     (xnn_pack_conv_kgo_w_function) xnn_pack_qu8_conv_kgo_w,
757     (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
758     &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
759     0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
760     &gemm_params, sizeof(gemm_params),
761     &dwconv_params, sizeof(dwconv_params),
762     NULL /* vmulcaddc params */, 0,
763     &xnn_params.qu8.gemm, dwconv_ukernel, NULL /* vmulcaddc parameters */,
764     NULL /* jit_gemm_params */,
765     false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QU8,
766     xnn_operator_type_convolution_nhwc_qu8,
767     0, NULL,
768     caches,
769     convolution_op_out);
770 }
771 
772 enum xnn_status xnn_create_convolution2d_nhwc_qs8(
773     uint32_t input_padding_top,
774     uint32_t input_padding_right,
775     uint32_t input_padding_bottom,
776     uint32_t input_padding_left,
777     uint32_t kernel_height,
778     uint32_t kernel_width,
779     uint32_t subsampling_height,
780     uint32_t subsampling_width,
781     uint32_t dilation_height,
782     uint32_t dilation_width,
783     uint32_t groups,
784     size_t group_input_channels,
785     size_t group_output_channels,
786     size_t input_channel_stride,
787     size_t output_channel_stride,
788     int8_t input_zero_point,
789     float input_scale,
790     float kernel_scale,
791     const int8_t* kernel,
792     const int32_t* bias,
793     int8_t output_zero_point,
794     float output_scale,
795     int8_t output_min,
796     int8_t output_max,
797     uint32_t flags,
798     xnn_caches_t caches,
799     xnn_operator_t* convolution_op_out)
800 {
801   if (input_scale <= 0.0f || !isnormal(input_scale)) {
802     xnn_log_error(
803       "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
804       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), input_scale);
805     return xnn_status_invalid_parameter;
806   }
807 
808   if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
809     xnn_log_error(
810       "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
811       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), kernel_scale);
812     return xnn_status_invalid_parameter;
813   }
814 
815   if (output_scale <= 0.0f || !isnormal(output_scale)) {
816     xnn_log_error(
817       "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
818       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_scale);
819     return xnn_status_invalid_parameter;
820   }
821 
822   if (output_min >= output_max) {
823     xnn_log_error(
824       "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
825       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_min, output_max);
826     return xnn_status_invalid_parameter;
827   }
828 
829   const float requantization_scale = input_scale * kernel_scale / output_scale;
830   if (requantization_scale >= 256.0f) {
831     xnn_log_error(
832       "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
833       "requantization scale %.7g is greater than or equal to 256.0",
834       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8),
835       input_scale, kernel_scale, output_scale, requantization_scale);
836     return xnn_status_unsupported_parameter;
837   }
838 
839   const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };
840 
841   union xnn_qs8_conv_minmax_params gemm_params;
842   if XNN_LIKELY(xnn_params.qs8.gemm.init.qs8 != NULL) {
843     xnn_params.qs8.gemm.init.qs8(&gemm_params,
844       requantization_scale, output_zero_point, output_min, output_max);
845   }
846 
847   union xnn_qs8_conv_minmax_params dwconv_params;
848   const struct dwconv_parameters* dwconv_ukernel =
849     find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.qs8.dwconv, XNN_MAX_QS8_DWCONV_UKERNELS);
850   if XNN_LIKELY(dwconv_ukernel != NULL) {
851     dwconv_ukernel->init.qs8(&dwconv_params,
852       requantization_scale, output_zero_point, output_min, output_max);
853   }
854 
855   return create_convolution2d_nhwc(
856     input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
857     kernel_height, kernel_width,
858     subsampling_height, subsampling_width,
859     dilation_height, dilation_width,
860     groups, group_input_channels, group_output_channels,
861     input_channel_stride, output_channel_stride,
862     kernel, bias, flags,
863     0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
864     0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
865     sizeof(int32_t) /* sizeof(bias element) */,
866     (xnn_pack_vmulcaddc_w_function) NULL,
867     (xnn_pack_dwconv_hwg_w_function) xnn_pack_qs8_dwconv_hwg_w,
868     (xnn_pack_dwconv_ghw_w_function) xnn_pack_qs8_dwconv_ghw_w,
869     (xnn_pack_gemm_goi_w_function) xnn_pack_qs8_gemm_goi_w,
870     (xnn_pack_conv_kgo_w_function) xnn_pack_qs8_conv_kgo_w,
871     (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
872     &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
873     0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
874     &gemm_params, sizeof(gemm_params),
875     &dwconv_params, sizeof(dwconv_params),
876     NULL /* vmulcaddc params */, 0,
877     &xnn_params.qs8.gemm, dwconv_ukernel, NULL /* vmulcaddc parameters */,
878     NULL /* jit_gemm_params */,
879     false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QS8,
880     xnn_operator_type_convolution_nhwc_qs8,
881     0, NULL,
882     caches,
883     convolution_op_out);
884 }
885 
886 enum xnn_status xnn_create_convolution2d_nhwc_qc8(
887     uint32_t input_padding_top,
888     uint32_t input_padding_right,
889     uint32_t input_padding_bottom,
890     uint32_t input_padding_left,
891     uint32_t kernel_height,
892     uint32_t kernel_width,
893     uint32_t subsampling_height,
894     uint32_t subsampling_width,
895     uint32_t dilation_height,
896     uint32_t dilation_width,
897     uint32_t groups,
898     size_t group_input_channels,
899     size_t group_output_channels,
900     size_t input_channel_stride,
901     size_t output_channel_stride,
902     int8_t input_zero_point,
903     float input_scale,
904     const float* kernel_scale,
905     const int8_t* kernel,
906     const int32_t* bias,
907     int8_t output_zero_point,
908     float output_scale,
909     int8_t output_min,
910     int8_t output_max,
911     uint32_t flags,
912     xnn_caches_t caches,
913     xnn_operator_t* convolution_op_out)
914 {
915   if (input_scale <= 0.0f || !isnormal(input_scale)) {
916     xnn_log_error(
917       "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
918       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), input_scale);
919     return xnn_status_invalid_parameter;
920   }
921 
922   for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) {
923     if (kernel_scale[output_channel] <= 0.0f || !isnormal(kernel_scale[output_channel])) {
924       xnn_log_error(
925         "failed to create %s operator with %.7g kernel scale in output channel #%zu: "
926         "scale must be finite, normalized, and positive",
927         xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), kernel_scale[output_channel],
928         output_channel);
929       return xnn_status_invalid_parameter;
930     }
931   }
932 
933   if (output_scale <= 0.0f || !isnormal(output_scale)) {
934     xnn_log_error(
935       "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
936       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_scale);
937     return xnn_status_invalid_parameter;
938   }
939 
940   if (output_min >= output_max) {
941     xnn_log_error(
942       "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
943       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_min, output_max);
944     return xnn_status_invalid_parameter;
945   }
946 
947   float* requantization_scale = XNN_SIMD_ALLOCA(groups * group_output_channels * sizeof(float));
948   for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) {
949     requantization_scale[output_channel] = input_scale * kernel_scale[output_channel] / output_scale;
950     if (requantization_scale[output_channel] >= 256.0f) {
951       xnn_log_error(
952         "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale in output channel #%zu: "
953         "requantization scale %.7g is greater than or equal to 256.0",
954         xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8),
955         input_scale, kernel_scale[output_channel], output_scale,
956         output_channel, requantization_scale[output_channel]);
957       return xnn_status_unsupported_parameter;
958     }
959   }
960 
961   const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };
962 
963   union xnn_qc8_conv_minmax_params gemm_params;
964   if XNN_LIKELY(xnn_params.qc8.gemm.init.qc8 != NULL) {
965     xnn_params.qc8.gemm.init.qc8(&gemm_params,
966       output_zero_point, output_min, output_max);
967   }
968 
969   union xnn_qc8_conv_minmax_params dwconv_params;
970   const struct dwconv_parameters* dwconv_ukernel =
971     find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.qc8.dwconv, XNN_MAX_QC8_DWCONV_UKERNELS);
972   if XNN_LIKELY(dwconv_ukernel != NULL) {
973     dwconv_ukernel->init.qc8(&dwconv_params,
974       output_zero_point, output_min, output_max);
975   }
976 
977   return create_convolution2d_nhwc(
978     input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
979     kernel_height, kernel_width,
980     subsampling_height, subsampling_width,
981     dilation_height, dilation_width,
982     groups, group_input_channels, group_output_channels,
983     input_channel_stride, output_channel_stride,
984     kernel, bias, flags,
985     0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
986     0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
987     sizeof(int32_t) /* sizeof(bias element) */,
988     (xnn_pack_vmulcaddc_w_function) NULL,
989     (xnn_pack_dwconv_hwg_w_function) xnn_pack_qs8_dwconv_hwg_w,
990     (xnn_pack_dwconv_ghw_w_function) xnn_pack_qs8_dwconv_ghw_w,
991     (xnn_pack_gemm_goi_w_function) xnn_pack_qs8_gemm_goi_w,
992     (xnn_pack_conv_kgo_w_function) xnn_pack_qs8_conv_kgo_w,
993     (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
994     &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
995     sizeof(float) /* extra weights bytes */, xnn_init_qc8_scale_fp32_params, requantization_scale,
996     &gemm_params, sizeof(gemm_params),
997     &dwconv_params, sizeof(dwconv_params),
998     NULL /* vmulcaddc params */, 0,
999     &xnn_params.qc8.gemm, dwconv_ukernel, NULL /* vmulcaddc parameters */,
1000     NULL /* jit_gemm_params */,
1001     false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QC8,
1002     xnn_operator_type_convolution_nhwc_qc8,
1003     0, NULL,
1004     caches,
1005     convolution_op_out);
1006 }
1007 
1008 enum xnn_status xnn_create_convolution2d_nhwc_f16(
1009     uint32_t input_padding_top,
1010     uint32_t input_padding_right,
1011     uint32_t input_padding_bottom,
1012     uint32_t input_padding_left,
1013     uint32_t kernel_height,
1014     uint32_t kernel_width,
1015     uint32_t subsampling_height,
1016     uint32_t subsampling_width,
1017     uint32_t dilation_height,
1018     uint32_t dilation_width,
1019     uint32_t groups,
1020     size_t group_input_channels,
1021     size_t group_output_channels,
1022     size_t input_channel_stride,
1023     size_t output_channel_stride,
1024     const void* kernel,
1025     const void* bias,
1026     float output_min,
1027     float output_max,
1028     uint32_t flags,
1029     xnn_caches_t caches,
1030     xnn_operator_t* convolution_op_out)
1031 {
1032   if (isnan(output_min)) {
1033     xnn_log_error(
1034       "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
1035       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
1036     return xnn_status_invalid_parameter;
1037   }
1038 
1039   if (isnan(output_max)) {
1040     xnn_log_error(
1041       "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
1042       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
1043     return xnn_status_invalid_parameter;
1044   }
1045 
1046   const uint16_t fp16_output_min = fp16_ieee_from_fp32_value(output_min);
1047   const uint16_t fp16_output_max = fp16_ieee_from_fp32_value(output_max);
1048   const float rounded_output_min = fp16_ieee_to_fp32_value(fp16_output_min);
1049   const float rounded_output_max = fp16_ieee_to_fp32_value(fp16_output_max);
1050   if (rounded_output_min >= rounded_output_max) {
1051     xnn_log_error(
1052       "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
1053       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16), rounded_output_min, rounded_output_max);
1054     return xnn_status_invalid_parameter;
1055   }
1056 
1057   union xnn_f16_minmax_params gemm_params;
1058   if XNN_LIKELY(xnn_params.f16.gemm.init.f16 != NULL) {
1059     xnn_params.f16.gemm.init.f16(&gemm_params,
1060       fp16_output_min, fp16_output_max);
1061   }
1062 
1063   union xnn_f16_minmax_params dwconv_params;
1064   const struct dwconv_parameters* dwconv_ukernel =
1065     find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.f16.dwconv, XNN_MAX_F16_DWCONV_UKERNELS);
1066   if XNN_LIKELY(dwconv_ukernel != NULL) {
1067     dwconv_ukernel->init.f16(&dwconv_params, fp16_output_min, fp16_output_max);
1068   }
1069 
1070   union xnn_f16_minmax_params vmulcaddc_params;
1071   if XNN_LIKELY(xnn_params.f16.vmulcaddc.init.f16 != NULL) {
1072     xnn_params.f16.vmulcaddc.init.f16(&vmulcaddc_params, fp16_output_min, fp16_output_max);
1073   }
1074 
1075   xnn_pack_vmulcaddc_w_function pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_function) xnn_pack_f16_vmulcaddc_w;
1076   xnn_pack_dwconv_hwg_w_function pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_function) xnn_pack_f16_dwconv_hwg_w;
1077   xnn_pack_dwconv_ghw_w_function pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_function) xnn_pack_f16_dwconv_ghw_w;
1078   xnn_pack_gemm_goi_w_function pack_gemm_goi_w = (xnn_pack_gemm_goi_w_function) xnn_pack_f16_gemm_goi_w;
1079   xnn_pack_conv_kgo_w_function pack_conv_kgo_w = (xnn_pack_conv_kgo_w_function) xnn_pack_f16_conv_kgo_w;
1080   xnn_pack_conv_goki_w_function pack_conv_goki_w = (xnn_pack_conv_goki_w_function) xnn_pack_f16_conv_goki_w;
1081   if (flags & XNN_FLAG_FP32_STATIC_WEIGHTS) {
1082     pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_to_f16_vmulcaddc_w;
1083     pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_to_f16_dwconv_hwg_w;
1084     pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_to_f16_dwconv_ghw_w;
1085     pack_gemm_goi_w = (xnn_pack_gemm_goi_w_function) xnn_pack_f32_to_f16_gemm_goi_w;
1086     pack_conv_kgo_w = (xnn_pack_conv_kgo_w_function) xnn_pack_f32_to_f16_conv_kgo_w;
1087     pack_conv_goki_w = (xnn_pack_conv_goki_w_function) xnn_pack_f32_to_f16_conv_goki_w;
1088   }
1089 
1090   return create_convolution2d_nhwc(
1091     input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1092     kernel_height, kernel_width,
1093     subsampling_height, subsampling_width,
1094     dilation_height, dilation_width,
1095     groups, group_input_channels, group_output_channels,
1096     input_channel_stride, output_channel_stride,
1097     kernel, bias, flags,
1098     1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
1099     1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
1100     sizeof(uint16_t) /* sizeof(bias element) */,
1101     pack_vmulcaddc_w,
1102     pack_dwconv_hwg_w,
1103     pack_dwconv_ghw_w,
1104     pack_gemm_goi_w,
1105     pack_conv_kgo_w,
1106     pack_conv_goki_w,
1107     NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
1108     0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
1109     &gemm_params, sizeof(gemm_params),
1110     &dwconv_params, sizeof(dwconv_params),
1111     &vmulcaddc_params, sizeof(vmulcaddc_params),
1112     &xnn_params.f16.gemm, dwconv_ukernel, &xnn_params.f16.vmulcaddc,
1113     NULL /* jit_gemm_params */,
1114     false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_F16,
1115     xnn_operator_type_convolution_nhwc_f16,
1116     0, NULL,
1117     caches,
1118     convolution_op_out);
1119 }
1120 
1121 enum xnn_status xnn_create_convolution2d_nhwc_f32(
1122     uint32_t input_padding_top,
1123     uint32_t input_padding_right,
1124     uint32_t input_padding_bottom,
1125     uint32_t input_padding_left,
1126     uint32_t kernel_height,
1127     uint32_t kernel_width,
1128     uint32_t subsampling_height,
1129     uint32_t subsampling_width,
1130     uint32_t dilation_height,
1131     uint32_t dilation_width,
1132     uint32_t groups,
1133     size_t group_input_channels,
1134     size_t group_output_channels,
1135     size_t input_channel_stride,
1136     size_t output_channel_stride,
1137     const float* kernel,
1138     const float* bias,
1139     float output_min,
1140     float output_max,
1141     uint32_t flags,
1142     xnn_caches_t caches,
1143     xnn_operator_t* convolution_op_out)
1144 {
1145   if (isnan(output_min)) {
1146     xnn_log_error(
1147       "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
1148       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1149     return xnn_status_invalid_parameter;
1150   }
1151 
1152   if (isnan(output_max)) {
1153     xnn_log_error(
1154       "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
1155       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1156     return xnn_status_invalid_parameter;
1157   }
1158 
1159   if (output_min >= output_max) {
1160     xnn_log_error(
1161       "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
1162       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32), output_min, output_max);
1163     return xnn_status_invalid_parameter;
1164   }
1165 
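  // Detect whether the requested clamp is a no-op ([-inf, +inf]) or a plain ReLU ([0, +inf]) so that a
  // linear or ReLU micro-kernel variant can be selected where one is available.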
1166   const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
1167   const bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f);
1168 
1169   const struct gemm_parameters* gemm_parameters = &xnn_params.f32.gemm;
1170   if (gemm_parameters->nr > group_output_channels) {
1171     // Default micro-kernel is suboptimal when its NR exceeds the number of output channels per group: try the narrower gemm2 micro-kernel instead.
1172 
1173     if (xnn_params.f32.gemm2.minmax.igemm[gemm_parameters->mr].function[XNN_UARCH_DEFAULT] != NULL) {
1174       gemm_parameters = &xnn_params.f32.gemm2;
1175     }
1176   }
1177 
1178   union xnn_f32_minmax_params gemm_params;
1179   if XNN_LIKELY(gemm_parameters->init.f32 != NULL) {
1180     gemm_parameters->init.f32(&gemm_params, output_min, output_max);
1181   }
1182 
1183   struct jit_gemm_params jit_gemm_params = {
1184     .f32_minmax = {
1185       .min = output_min,
1186       .max = output_max
1187     }
1188   };
1189 
1190   union xnn_f32_minmax_params dwconv_params;
1191   const struct dwconv_parameters* dwconv_ukernel =
1192     find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS);
1193   if XNN_LIKELY(dwconv_ukernel != NULL) {
1194     dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max);
1195   }
1196 
1197   union xnn_f32_minmax_params vmulcaddc_params;
1198   if XNN_LIKELY(xnn_params.f32.vmulcaddc.init.f32 != NULL) {
1199     xnn_params.f32.vmulcaddc.init.f32(&vmulcaddc_params, output_min, output_max);
1200   }
1201 
1202   return create_convolution2d_nhwc(
1203     input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1204     kernel_height, kernel_width,
1205     subsampling_height, subsampling_width,
1206     dilation_height, dilation_width,
1207     groups, group_input_channels, group_output_channels,
1208     input_channel_stride, output_channel_stride,
1209     kernel, bias, flags,
1210     2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
1211     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
1212     sizeof(float) /* sizeof(bias element) */,
1213     (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_vmulcaddc_w,
1214     (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_dwconv_hwg_w,
1215     (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_dwconv_ghw_w,
1216     (xnn_pack_gemm_goi_w_function) xnn_pack_f32_gemm_goi_w,
1217     (xnn_pack_conv_kgo_w_function) xnn_pack_f32_conv_kgo_w,
1218     (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
1219     NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
1220     0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
1221     &gemm_params, sizeof(gemm_params),
1222     &dwconv_params, sizeof(dwconv_params),
1223     &vmulcaddc_params, sizeof(vmulcaddc_params),
1224     gemm_parameters, dwconv_ukernel, &xnn_params.f32.vmulcaddc,
1225     &jit_gemm_params,
1226     linear_activation, relu_activation, XNN_INIT_FLAG_F32,
1227     xnn_operator_type_convolution_nhwc_f32,
1228     0, NULL,
1229     caches,
1230     convolution_op_out);
1231 }
1232 
1233 enum xnn_status xnn_create_fused_convolution2d_nhwc_f32(
1234     uint32_t input_padding_top,
1235     uint32_t input_padding_right,
1236     uint32_t input_padding_bottom,
1237     uint32_t input_padding_left,
1238     uint32_t kernel_height,
1239     uint32_t kernel_width,
1240     uint32_t subsampling_height,
1241     uint32_t subsampling_width,
1242     uint32_t dilation_height,
1243     uint32_t dilation_width,
1244     uint32_t groups,
1245     size_t group_input_channels,
1246     size_t group_output_channels,
1247     size_t input_channel_stride,
1248     size_t output_channel_stride,
1249     const float* kernel,
1250     const float* bias,
1251     size_t num_post_operations,
1252     struct xnn_post_operation* post_operations,
1253     uint32_t flags,
1254     xnn_caches_t caches,
1255     xnn_operator_t* convolution_op_out)
1256 {
1257   #if !XNN_ENABLE_JIT
1258     xnn_log_error(
1259       "failed to create %s operator: convolution with post operations is available only when JIT is enabled",
1260       xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1261     return xnn_status_invalid_parameter;
1262   #endif
1263 
1264   // The convolution is created with linear activation; any clamping must be specified as a post operation.
1265   const float output_max = INFINITY;
1266   const float output_min = -INFINITY;
1267 
1268   struct jit_gemm_params jit_gemm_params = {
1269     .f32_minmax = {
1270       .min = output_min,
1271       .max = output_max
1272     },
1273     .num_post_operations = num_post_operations,
1274     .post_operations = post_operations,
1275   };
1276 
1277   char* post_operation_params = allocate_and_initialize_post_operation_params(num_post_operations, post_operations);
1278 
1279   union xnn_f32_minmax_params gemm_params;
1280   if XNN_LIKELY(xnn_params.f32.gemm.init.f32 != NULL) {
1281     xnn_params.f32.gemm.init.f32(&gemm_params, output_min, output_max);
1282   }
1283 
1284   union xnn_f32_minmax_params dwconv_params;
1285   const struct dwconv_parameters* dwconv_ukernel =
1286     find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS);
1287   if XNN_LIKELY(dwconv_ukernel != NULL) {
1288     dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max);
1289   }
1290 
1291   union xnn_f32_minmax_params vmulcaddc_params;
1292   if XNN_LIKELY(xnn_params.f32.vmulcaddc.init.f32 != NULL) {
1293     xnn_params.f32.vmulcaddc.init.f32(&vmulcaddc_params, output_min, output_max);
1294   }
1295 
1296   return create_convolution2d_nhwc(
1297     input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1298     kernel_height, kernel_width,
1299     subsampling_height, subsampling_width,
1300     dilation_height, dilation_width,
1301     groups, group_input_channels, group_output_channels,
1302     input_channel_stride, output_channel_stride,
1303     kernel, bias, flags,
1304     2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
1305     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
1306     sizeof(float) /* sizeof(bias element) */,
1307     (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_vmulcaddc_w,
1308     (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_dwconv_hwg_w,
1309     (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_dwconv_ghw_w,
1310     (xnn_pack_gemm_goi_w_function) xnn_pack_f32_gemm_goi_w,
1311     (xnn_pack_conv_kgo_w_function) xnn_pack_f32_conv_kgo_w,
1312     (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
1313     NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
1314     0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
1315     &gemm_params, sizeof(gemm_params),
1316     &dwconv_params, sizeof(dwconv_params),
1317     &vmulcaddc_params, sizeof(vmulcaddc_params),
1318     &xnn_params.f32.gemm, dwconv_ukernel, &xnn_params.f32.vmulcaddc,
1319     &jit_gemm_params,
1320     true /* linear_activation */, false /* relu_activation */, XNN_INIT_FLAG_F32,
1321     xnn_operator_type_convolution_nhwc_f32,
1322     num_post_operations, post_operation_params,
1323     caches,
1324     convolution_op_out);
1325 }
1326 
1327 static enum xnn_status setup_convolution2d_nhwc(
1328   xnn_operator_t convolution_op,
1329   enum xnn_operator_type expected_operator_type,
1330   size_t batch_size,
1331   size_t input_height,
1332   size_t input_width,
1333   const void* input,
1334   void* output,
1335   uint32_t datatype_init_flags,
1336   uint32_t log2_input_element_size,
1337   uint32_t log2_filter_element_size,
1338   uint32_t extra_weights_elements_size,
1339   uint32_t log2_output_element_size,
1340   size_t num_threads)
1341 {
1342   if (convolution_op->type != expected_operator_type) {
1343     xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
1344       xnn_operator_type_to_string(expected_operator_type),
1345       xnn_operator_type_to_string(convolution_op->type));
1346     return xnn_status_invalid_parameter;
1347   }
1348   convolution_op->state = xnn_run_state_invalid;
1349 
1350   if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
1351     xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
1352       xnn_operator_type_to_string(convolution_op->type));
1353     return xnn_status_uninitialized;
1354   }
1355 
1356   if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
1357     xnn_log_error(
1358       "failed to setup %s operator: operations on data type are not supported",
1359       xnn_operator_type_to_string(convolution_op->type));
1360     return xnn_status_unsupported_hardware;
1361   }
1362 
1363   if (input_width == 0 || input_height == 0) {
1364     xnn_log_error(
1365       "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
1366       xnn_operator_type_to_string(convolution_op->type), input_width, input_height);
1367     return xnn_status_invalid_parameter;
1368   }
1369 
1370   if (batch_size == 0) {
1371     convolution_op->state = xnn_run_state_skip;
1372     return xnn_status_success;
1373   }
1374 
1375   if (convolution_op->weights_cache != NULL && !xnn_weights_cache_is_finalized(convolution_op->weights_cache)) {
1376     xnn_log_error("failed to setup %s operator: weights cache is not finalized",
1377       xnn_operator_type_to_string(convolution_op->type));
1378     return xnn_status_invalid_state;
1379   }
1380 
1381   convolution_op->batch_size = batch_size;
1382   convolution_op->input_height = input_height;
1383   convolution_op->input_width = input_width;
1384   convolution_op->input = input;
1385 
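  // With TF SAME padding the output size is ceil(input / stride); the total padding needed to produce
  // that output is split so the top/left side gets the smaller half and the bottom/right side gets the
  // remainder.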
1386   if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
1387     convolution_op->output_height = compute_output_dimension_with_tf_same_padding(
1388         input_height, convolution_op->stride_height);
1389     convolution_op->output_width = compute_output_dimension_with_tf_same_padding(
1390         input_width, convolution_op->stride_width);
1391 
1392     const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1;
1393     const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1;
1394     const size_t total_padding_height =
1395       (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height;
1396     const size_t total_padding_width =
1397       (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width;
1398     convolution_op->padding_top = total_padding_height / 2;
1399     convolution_op->padding_left = total_padding_width / 2;
1400     convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top;
1401     convolution_op->padding_right = total_padding_width - convolution_op->padding_left;
1402   } else {
1403     convolution_op->output_height = xnn_compute_convolution_output_dimension(
1404         convolution_op->padding_top + input_height + convolution_op->padding_bottom,
1405         convolution_op->kernel_height,
1406         convolution_op->dilation_height,
1407         convolution_op->stride_height);
1408     convolution_op->output_width = xnn_compute_convolution_output_dimension(
1409         convolution_op->padding_left + input_width + convolution_op->padding_right,
1410         convolution_op->kernel_width,
1411         convolution_op->dilation_width,
1412         convolution_op->stride_width);
1413   }
1414   convolution_op->output = output;
1415 
1416   switch (convolution_op->ukernel.type) {
1417     case xnn_ukernel_type_gemm:
1418     {
1419       // Convolution maps directly to a GEMM and doesn't use an indirection buffer.
1420 
1421       const size_t output_height = convolution_op->output_height;
1422       const size_t output_width = convolution_op->output_width;
1423       const size_t output_size = output_height * output_width;
1424       const size_t batch_output_size = batch_size * output_size;
1425 
1426       const size_t groups = convolution_op->groups;
1427       const size_t group_input_channels = convolution_op->group_input_channels;
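      // Stride between the packed weights of consecutive output channels: the per-channel extra weights
      // (e.g. bias) plus the input channels rounded up to the micro-kernel's KR*SR packing granularity.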
1428       const size_t w_stride = extra_weights_elements_size +
1429         (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr * convolution_op->ukernel.gemm.sr) << log2_filter_element_size);
1430       const size_t group_output_channels = convolution_op->group_output_channels;
1431 
1432       uint32_t mr = convolution_op->ukernel.gemm.mr;
1433       const uint32_t nr = convolution_op->ukernel.gemm.nr;
1434       struct xnn_hmp_gemm_ukernel *gemm_cases = convolution_op->ukernel.gemm.gemm_cases;
1435 
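      // Choose MR: with M specialization enabled, a heuristic picks the MR variant best suited to the
      // GEMM M dimension (batch_output_size); otherwise fall back to the MR=1 kernel only when a single
      // output row is computed.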
1436       #if XNN_ENABLE_GEMM_M_SPECIALIZATION
1437         mr = xnn_get_heuristic_mr_gemm(batch_output_size, mr, nr, gemm_cases);
1438       #else
1439         if (batch_output_size == 1 && gemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) {
1440           mr = 1;
1441         }
1442       #endif
1443 
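      // If a JIT-generated GEMM for the selected MR was cached when the operator was created, substitute
      // it for the prebuilt micro-kernel.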
1444       #if XNN_PLATFORM_JIT
1445         if (convolution_op->code_cache != NULL) {
1446           const size_t jit_code_offset = gemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT];
1447           if (jit_code_offset != XNN_CACHE_NOT_FOUND) {
1448             gemm_cases[mr - 1].function[XNN_UARCH_DEFAULT] =
1449                 (xnn_gemm_ukernel_function) cached_code_at_offset(convolution_op, jit_code_offset);
1450           }
1451         }
1452       #endif  // XNN_PLATFORM_JIT
1453       struct xnn_hmp_gemm_ukernel gemm_ukernel = gemm_cases[mr - 1];
1454 
1455       convolution_op->context.gemm = (struct gemm_context) {
1456           .k_scaled = group_input_channels << log2_input_element_size,
1457           .a = input,
1458           .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1459           .packed_w = packed_weights(convolution_op),
1460           .w_stride = w_stride,
1461           .wg_stride = w_stride * round_up(group_output_channels, nr),
1462           .c = output,
1463           .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1464           .cn_stride = nr << log2_output_element_size,
1465           .cg_stride = group_output_channels << log2_output_element_size,
1466           .log2_csize = log2_output_element_size,
1467           .ukernel = gemm_ukernel,
1468       };
1469       memcpy(&convolution_op->context.gemm.params, &convolution_op->params, sizeof(convolution_op->context.gemm.params));
1470       if (convolution_op->num_post_operation_params == 0) {
1471         convolution_op->context.gemm.fused_params = &convolution_op->context.gemm.params;
1472       } else {
1473         convolution_op->context.gemm.fused_params = convolution_op->post_operation_params;
1474       }
1475 
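      // Pick the N-dimension tile size: a single NR-wide tile in test mode, otherwise the whole group,
      // shrunk in multiples of NR when multi-threaded so each thread gets roughly target_tiles_per_thread
      // tiles of work.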
1476       #if XNN_TEST_MODE
1477         const size_t nc = nr;
1478       #else
1479         size_t nc = group_output_channels;
1480         if (num_threads > 1) {
1481           const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
1482           const size_t target_tiles_per_thread = 5;
1483           const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
1484           if (max_nc < nc) {
1485             nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
1486           }
1487         }
1488       #endif
1489       if (groups == 1) {
1490         #if XNN_MAX_UARCH_TYPES > 1
1491           if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
1492             convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
1493             convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm;
1494           } else {
1495             convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1496             convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
1497           }
1498         #else
1499           convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1500           convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
1501         #endif
1502         convolution_op->compute.range[0] = batch_output_size;
1503         convolution_op->compute.range[1] = group_output_channels;
1504         convolution_op->compute.tile[0] = mr;
1505         convolution_op->compute.tile[1] = nc;
1506       } else {
1507         #if XNN_MAX_UARCH_TYPES > 1
1508           if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
1509             convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1510             convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_gemm;
1511           } else {
1512             convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1513             convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
1514           }
1515         #else
1516           convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1517           convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
1518         #endif
1519         convolution_op->compute.range[0] = groups;
1520         convolution_op->compute.range[1] = batch_output_size;
1521         convolution_op->compute.range[2] = group_output_channels;
1522         convolution_op->compute.tile[0] = mr;
1523         convolution_op->compute.tile[1] = nc;
1524       }
1525       convolution_op->state = xnn_run_state_ready;
1526 
1527       return xnn_status_success;
1528     }
1529     case xnn_ukernel_type_igemm:
1530     {
1531       const size_t groups = convolution_op->groups;
1532       const size_t kernel_height = convolution_op->kernel_height;
1533       const size_t kernel_width = convolution_op->kernel_width;
1534       const size_t kernel_size = kernel_height * kernel_width;
1535       const size_t output_height = convolution_op->output_height;
1536       const size_t output_width = convolution_op->output_width;
1537       const size_t output_size = output_height * output_width;
1538 
1539       uint32_t mr = convolution_op->ukernel.igemm.mr;
1540       const uint32_t nr = convolution_op->ukernel.igemm.nr;
1541       struct xnn_hmp_igemm_ukernel* igemm_cases = convolution_op->ukernel.igemm.igemm_cases;
1542 
1543       #if XNN_ENABLE_GEMM_M_SPECIALIZATION
1544         mr = xnn_get_heuristic_mr_igemm(output_size, mr, nr, igemm_cases);
1545       #else
1546         if (output_size == 1 && igemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) {
1547           mr = 1;
1548         }
1549       #endif
1550 
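      // As in the GEMM path, substitute a cached JIT-generated IGEMM for the selected MR if one exists.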
1551       #if XNN_PLATFORM_JIT
1552         if (convolution_op->code_cache != NULL) {
1553           const size_t jit_code_offset = igemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT];
1554           if (jit_code_offset != XNN_CACHE_NOT_FOUND) {
1555             igemm_cases[mr - 1].function[XNN_UARCH_DEFAULT] =
1556                 (xnn_igemm_ukernel_function) cached_code_at_offset(convolution_op, jit_code_offset);
1557           }
1558         }
1559       #endif  // XNN_PLATFORM_JIT
1560       struct xnn_hmp_igemm_ukernel igemm_ukernel = igemm_cases[mr - 1];
1561 
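      // The indirection buffer holds one input-pixel pointer per (kernel element, output pixel) pair;
      // the output pixel count is rounded up to a multiple of MR so a partially filled last tile still
      // reads MR full rows of pointers.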
1562       const size_t tiled_output_size = round_up(output_size, mr);
1563       const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;
1564 
1565       if (input_height != convolution_op->last_input_height ||
1566           input_width != convolution_op->last_input_width)
1567       {
1568         const void** indirection_buffer = (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size);
1569         if (indirection_buffer == NULL) {
1570           xnn_log_error(
1571             "failed to allocate %zu bytes for %s operator indirection buffer",
1572             indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
1573           return xnn_status_out_of_memory;
1574         }
1575         convolution_op->indirection_buffer = indirection_buffer;
1576         convolution_op->last_input = input;
1577         convolution_op->last_input_height = input_height;
1578         convolution_op->last_input_width = input_width;
1579 
1580         xnn_indirection_init_conv2d(convolution_op, mr, log2_input_element_size);
1581       }
1582 
1583       const size_t group_input_channels = convolution_op->group_input_channels;
1584       const size_t w_stride = extra_weights_elements_size +
1585         (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr * convolution_op->ukernel.igemm.sr) * kernel_size << log2_filter_element_size);
1586       const size_t group_output_channels = convolution_op->group_output_channels;
1587       convolution_op->context.igemm = (struct igemm_context) {
1588           .ks = kernel_size,
1589           .ks_scaled = kernel_size * mr * sizeof(void*),
1590           .kc = group_input_channels << log2_input_element_size,
1591           .w_stride = w_stride,
1592           .indirect_a = convolution_op->indirection_buffer,
1593           .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
1594           .zero = convolution_op->zero_buffer,
1595           .packed_w = packed_weights(convolution_op),
1596           .c = convolution_op->output,
1597           .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1598           .cn_stride = nr << log2_output_element_size,
1599           .ga_stride = group_input_channels << log2_input_element_size,
1600           .gw_stride = w_stride * round_up(group_output_channels, nr),
1601           .gc_stride = group_output_channels << log2_output_element_size,
1602           .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
1603           .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
1604           .log2_csize = log2_output_element_size,
1605           .ukernel = igemm_ukernel,
1606       };
1607       memcpy(&convolution_op->context.igemm.params, &convolution_op->params, sizeof(convolution_op->context.igemm.params));
1608 
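      // Same N-tile selection as the GEMM path: NR in test mode, otherwise the whole group, shrunk in
      // multiples of NR to balance work across threads.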
1609       #if XNN_TEST_MODE
1610         const size_t nc = nr;
1611       #else
1612         size_t nc = group_output_channels;
1613         if (num_threads > 1) {
1614           const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
1615           const size_t target_tiles_per_thread = 5;
1616           const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
1617           if (max_nc < nc) {
1618             nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
1619           }
1620         }
1621       #endif
1622       if (groups == 1) {
1623         #if XNN_MAX_UARCH_TYPES > 1
1624           if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1625             if (batch_size > 1) {
1626               convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1627               convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_batch_hmp_igemm;
1628             } else {
1629               convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
1630               convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_igemm;
1631             }
1632           } else {
1633             if (batch_size > 1) {
1634               convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1635               convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1636             } else {
1637               convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1638               convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1639             }
1640           }
1641         #else
1642           if (batch_size > 1) {
1643             convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1644             convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1645           } else {
1646             convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1647             convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1648           }
1649         #endif
1650         if (batch_size > 1) {
1651           convolution_op->compute.range[0] = batch_size;
1652           convolution_op->compute.range[1] = output_size;
1653           convolution_op->compute.range[2] = group_output_channels;
1654         } else {
1655           convolution_op->compute.range[0] = output_size;
1656           convolution_op->compute.range[1] = group_output_channels;
1657         }
1658         convolution_op->compute.tile[0] = mr;
1659         convolution_op->compute.tile[1] = nc;
1660       } else {
1661         #if XNN_MAX_UARCH_TYPES > 1
1662           if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1663             if (batch_size > 1) {
1664               convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d_with_uarch;
1665               convolution_op->compute.task_4d_tile_2d_with_id = (pthreadpool_task_4d_tile_2d_with_id_t) xnn_compute_hmp_grouped_batch_igemm;
1666             } else {
1667               convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1668               convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_igemm;
1669             }
1670           } else {
1671             if (batch_size > 1) {
1672               convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
1673               convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1674             } else {
1675               convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1676               convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1677             }
1678           }
1679         #else
1680           if (batch_size > 1) {
1681             convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
1682             convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1683           } else {
1684             convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1685             convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1686           }
1687         #endif
1688         if (batch_size > 1) {
1689           convolution_op->compute.range[0] = batch_size;
1690           convolution_op->compute.range[1] = groups;
1691           convolution_op->compute.range[2] = output_size;
1692           convolution_op->compute.range[3] = group_output_channels;
1693         } else {
1694           convolution_op->compute.range[0] = groups;
1695           convolution_op->compute.range[1] = output_size;
1696           convolution_op->compute.range[2] = group_output_channels;
1697         }
1698         convolution_op->compute.tile[0] = mr;
1699         convolution_op->compute.tile[1] = nc;
1700       }
1701       convolution_op->state = xnn_run_state_ready;
1702 
1703       return xnn_status_success;
1704     }
1705     case xnn_ukernel_type_dwconv:
1706     {
1707       const size_t kernel_height = convolution_op->kernel_height;
1708       const size_t kernel_width = convolution_op->kernel_width;
1709       const size_t kernel_size = kernel_height * kernel_width;
1710       const size_t output_height = convolution_op->output_height;
1711       const size_t output_width = convolution_op->output_width;
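      // Indirection-buffer geometry: each output pixel advances step_width pointer columns (stride_width
      // without dilation, the full kernel_width otherwise), and each output row occupies step_height
      // pointers: a full kernel window for the first pixel plus step_width * kernel_height for each of
      // the remaining output_width - 1 pixels.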
1712       const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
1713       const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
1714       const size_t primary_tile = convolution_op->ukernel.dwconv.primary_tile;
1715       if (input_height != convolution_op->last_input_height || input_width != convolution_op->last_input_width) {
1716         // The micro-kernel may read (primary_tile - kernel_size) elements past the end of the indirection buffer, so allocate extra space for them.
1717         const size_t indirection_buffer_size =
1718           sizeof(void*) * (primary_tile - kernel_size + output_height * step_height);
1719 
1720         const void** indirection_buffer =
1721           (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size);
1722         if (indirection_buffer == NULL) {
1723           xnn_log_error("failed to allocate %zu bytes for %s operator indirection buffer",
1724             indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
1725           return xnn_status_out_of_memory;
1726         }
1727         convolution_op->indirection_buffer = indirection_buffer;
1728 
1729         xnn_indirection_init_dwconv2d(convolution_op, step_height, step_width, primary_tile, log2_input_element_size);
1730 
1731         convolution_op->last_input = input;
1732         convolution_op->last_input_height = input_height;
1733         convolution_op->last_input_width = input_width;
1734       }
1735 
1736       const size_t groups = convolution_op->groups;
1737       convolution_op->context.dwconv = (struct dwconv_context) {
1738           .indirect_input = convolution_op->indirection_buffer,
1739           .indirect_input_width_stride = kernel_height * step_width * sizeof(void*),
1740           .indirect_input_height_stride = step_height * sizeof(void*),
1741           .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
1742           .input_batch_stride = (input_height * input_width * convolution_op->input_pixel_stride) << log2_input_element_size,
1743           .packed_weights = packed_weights(convolution_op),
1744           .output = convolution_op->output,
1745           .output_batch_stride = (output_height * output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
1746           .output_height_stride = (output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
1747           .output_width = output_width,
1748           .groups = groups,
1749           .zero = convolution_op->zero_buffer,
1750           .output_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
1751           .unipass_ukernel = convolution_op->ukernel.dwconv.unipass_function,
1752       };
1753       memcpy(&convolution_op->context.dwconv.params, &convolution_op->params, sizeof(convolution_op->context.dwconv.params));
1754 
1755       convolution_op->compute.type = xnn_parallelization_type_2d;
1756       convolution_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv_unipass;
1757       convolution_op->compute.range[0] = batch_size;
1758       convolution_op->compute.range[1] = output_height;
1759       convolution_op->state = xnn_run_state_ready;
1760 
1761       return xnn_status_success;
1762     }
1763     case xnn_ukernel_type_vmulcaddc:
1764     {
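      // This path covers convolutions that reduce to a per-pixel multiply-add with per-channel scale and
      // bias (vmulcaddc), so the whole batch is processed as one flat run of output pixels.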
1765       const size_t batch_output_size = batch_size * convolution_op->output_height * convolution_op->output_width;
1766 
1767       convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
1768           .n = convolution_op->groups << log2_input_element_size,
1769           .x = input,
1770           .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1771           .w = packed_weights(convolution_op),
1772           .y = output,
1773           .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1774           .ukernel = convolution_op->ukernel.vmulcaddc.function,
1775       };
1776       memcpy(&convolution_op->context.vmulcaddc.params, &convolution_op->params, sizeof(convolution_op->context.vmulcaddc.params));
1777 
1778       #if XNN_TEST_MODE
1779         const size_t mc = convolution_op->ukernel.vmulcaddc.mr;
1780       #else
1781         size_t mc = batch_output_size;
1782         if (num_threads > 1) {
1783           const size_t target_tiles_per_thread = 5;
1784           const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
1785           if (max_mc < mc) {
1786             const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
1787             mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
1788           }
1789         }
1790       #endif
1791       convolution_op->compute.type = xnn_parallelization_type_1d_tile_1d;
1792       convolution_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
1793       convolution_op->compute.range[0] = batch_output_size;
1794       convolution_op->compute.tile[0] = mc;
1795       convolution_op->state = xnn_run_state_ready;
1796 
1797       return xnn_status_success;
1798     }
1799     default:
1800       XNN_UNREACHABLE;
1801   }
1802 }
1803 
1804 enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
1805     xnn_operator_t convolution_op,
1806     size_t batch_size,
1807     size_t input_height,
1808     size_t input_width,
1809     const uint8_t* input,
1810     uint8_t* output,
1811     pthreadpool_t threadpool)
1812 {
1813   return setup_convolution2d_nhwc(
1814     convolution_op, xnn_operator_type_convolution_nhwc_qu8,
1815     batch_size, input_height, input_width,
1816     input, output,
1817     XNN_INIT_FLAG_QU8,
1818     0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
1819     0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
1820     sizeof(int32_t) /* sizeof(extra weights elements) */,
1821     0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
1822     pthreadpool_get_threads_count(threadpool));
1823 }
1824 
1825 enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
1826     xnn_operator_t convolution_op,
1827     size_t batch_size,
1828     size_t input_height,
1829     size_t input_width,
1830     const int8_t* input,
1831     int8_t* output,
1832     pthreadpool_t threadpool)
1833 {
1834   return setup_convolution2d_nhwc(
1835     convolution_op, xnn_operator_type_convolution_nhwc_qs8,
1836     batch_size, input_height, input_width,
1837     input, output,
1838     XNN_INIT_FLAG_QS8,
1839     0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
1840     0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
1841     sizeof(int32_t) /* sizeof(extra weights elements) */,
1842     0 /* log2(sizeof(output element)) = log2(sizeof(int8_t)) */,
1843     pthreadpool_get_threads_count(threadpool));
1844 }
1845 
1846 enum xnn_status xnn_setup_convolution2d_nhwc_qc8(
1847     xnn_operator_t convolution_op,
1848     size_t batch_size,
1849     size_t input_height,
1850     size_t input_width,
1851     const int8_t* input,
1852     int8_t* output,
1853     pthreadpool_t threadpool)
1854 {
1855   return setup_convolution2d_nhwc(
1856     convolution_op, xnn_operator_type_convolution_nhwc_qc8,
1857     batch_size, input_height, input_width,
1858     input, output,
1859     XNN_INIT_FLAG_QC8,
1860     0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
1861     0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
1862     sizeof(int32_t) + sizeof(float) /* sizeof(extra weights elements) */,
1863     0 /* log2(sizeof(output element)) = log2(sizeof(int8_t)) */,
1864     pthreadpool_get_threads_count(threadpool));
1865 }
1866 
1867 enum xnn_status xnn_setup_convolution2d_nhwc_f16(
1868     xnn_operator_t convolution_op,
1869     size_t batch_size,
1870     size_t input_height,
1871     size_t input_width,
1872     const void* input,
1873     void* output,
1874     pthreadpool_t threadpool)
1875 {
1876   return setup_convolution2d_nhwc(
1877     convolution_op, xnn_operator_type_convolution_nhwc_f16,
1878     batch_size, input_height, input_width,
1879     input, output,
1880     XNN_INIT_FLAG_F16,
1881     1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
1882     1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
1883     sizeof(uint16_t) /* sizeof(extra weights elements) */,
1884     1 /* log2(sizeof(output element)) = log2(sizeof(uint16_t)) */,
1885     pthreadpool_get_threads_count(threadpool));
1886 }
1887 
1888 enum xnn_status xnn_setup_convolution2d_nhwc_f32(
1889     xnn_operator_t convolution_op,
1890     size_t batch_size,
1891     size_t input_height,
1892     size_t input_width,
1893     const float* input,
1894     float* output,
1895     pthreadpool_t threadpool)
1896 {
1897   return setup_convolution2d_nhwc(
1898     convolution_op, xnn_operator_type_convolution_nhwc_f32,
1899     batch_size, input_height, input_width,
1900     input, output,
1901     XNN_INIT_FLAG_F32,
1902     2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
1903     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
1904     sizeof(float) /* sizeof(extra weights elements) */,
1905     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
1906     pthreadpool_get_threads_count(threadpool));
1907 }
1908