1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <assert.h>
10 #include <math.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <string.h>
16 
17 #include <xnnpack.h>
18 #include <xnnpack/allocator.h>
19 #include <xnnpack/common.h>
20 #include <xnnpack/compute.h>
21 #include <xnnpack/indirection.h>
22 #include <xnnpack/log.h>
23 #include <xnnpack/math.h>
24 #include <xnnpack/operator.h>
25 #include <xnnpack/pack.h>
26 #include <xnnpack/params-init.h>
27 #include <xnnpack/params.h>
28 
29 
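// Standard convolution output-size arithmetic: with an effective kernel extent of
// (kernel - 1) * dilation + 1, the output dimension is
//   floor((padded_input - effective_kernel) / subsampling) + 1,
// where doz() is the saturating "difference or zero" subtraction from <xnnpack/math.h>.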
30 static inline size_t compute_output_dimension(
31     size_t padded_input_dimension,
32     size_t kernel_dimension,
33     size_t dilation_dimension,
34     size_t subsampling_dimension)
35 {
36   const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
37   return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
38 }
39 
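// TensorFlow SAME padding: the output dimension depends only on the input size and the
// subsampling (stride), i.e. ceil(input / subsampling); the padding itself is computed
// later, at setup time, once the input size is known.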
40 static inline size_t compute_output_dimension_with_tf_same_padding(
41     size_t input_dimension,
42     size_t subsampling_dimension)
43 {
44   return divide_round_up(input_dimension, subsampling_dimension);
45 }
46 
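// Linear search for a depthwise-convolution micro-kernel whose primary tile (mr) exactly
// matches the kernel size, e.g. mr == 9 for a 3x3 kernel.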
47 static const struct dwconv_parameters* find_dwigemm_ukernel(
48     size_t kernel_size,
49     const struct dwconv_parameters* ukernel,
50     size_t num_ukernels)
51 {
52   while (num_ukernels-- != 0) {
53     if (ukernel->mr == kernel_size) {
54       return ukernel;
55     }
56     ukernel++;
57   }
58   return NULL;
59 }
60 
61 enum xnn_status xnn_create_convolution2d_nhwc_q8(
62     uint32_t input_padding_top,
63     uint32_t input_padding_right,
64     uint32_t input_padding_bottom,
65     uint32_t input_padding_left,
66     uint32_t kernel_height,
67     uint32_t kernel_width,
68     uint32_t subsampling_height,
69     uint32_t subsampling_width,
70     uint32_t dilation_height,
71     uint32_t dilation_width,
72     uint32_t groups,
73     size_t group_input_channels,
74     size_t group_output_channels,
75     size_t input_pixel_stride,
76     size_t output_pixel_stride,
77     uint8_t input_zero_point,
78     float input_scale,
79     uint8_t kernel_zero_point,
80     float kernel_scale,
81     const uint8_t* kernel,
82     const int32_t* bias,
83     uint8_t output_zero_point,
84     float output_scale,
85     uint8_t output_min,
86     uint8_t output_max,
87     uint32_t flags,
88     xnn_operator_t* convolution_op_out)
89 {
90   xnn_operator_t convolution_op = NULL;
91   enum xnn_status status = xnn_status_uninitialized;
92 
93   if (!xnn_params.initialized) {
94     xnn_log_error("failed to create Convolution operator: XNNPACK is not initialized");
95     goto error;
96   }
97 
98   status = xnn_status_invalid_parameter;
99 
100   if (kernel_width == 0 || kernel_height == 0) {
101     xnn_log_error(
102       "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
103       kernel_width, kernel_height);
104     goto error;
105   }
106 
107   if (subsampling_width == 0 || subsampling_height == 0) {
108     xnn_log_error(
109       "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " subsampling: "
110       "subsampling dimensions must be non-zero",
111       subsampling_width, subsampling_height);
112     goto error;
113   }
114 
115   if (dilation_width == 0 || dilation_height == 0) {
116     xnn_log_error(
117       "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
118       "dilation dimensions must be non-zero",
119       dilation_width, dilation_height);
120     goto error;
121   }
122 
123   if (groups == 0) {
124     xnn_log_error(
125       "failed to create Convolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
126     goto error;
127   }
128 
129   if (group_input_channels == 0) {
130     xnn_log_error(
131       "failed to create Convolution operator with %zu input channels per group: "
132       "number of channels must be non-zero",
133       group_input_channels);
134     goto error;
135   }
136 
137   if (group_output_channels == 0) {
138     xnn_log_error(
139       "failed to create Convolution operator with %zu output channels per group: "
140       "number of channels must be non-zero",
141       group_output_channels);
142     goto error;
143   }
144 
145   const size_t input_channels = groups * group_input_channels;
146   if (input_pixel_stride < input_channels) {
147     xnn_log_error(
148       "failed to create Convolution operator with input pixel stride of %zu: "
149       "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
150       input_pixel_stride, groups, group_input_channels);
151     goto error;
152   }
153 
154   const size_t output_channels = groups * group_output_channels;
155   if (output_pixel_stride < output_channels) {
156     xnn_log_error(
157       "failed to create Convolution operator with output pixel stride of %zu: "
158       "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
159       output_pixel_stride, groups, group_output_channels);
160     goto error;
161   }
162 
163   if (input_scale <= 0.0f || !isnormal(input_scale)) {
164     xnn_log_error(
165       "failed to create Convolution operator with %.7g input scale: scale must be finite, normalized, and positive",
166       input_scale);
167     goto error;
168   }
169 
170   if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
171     xnn_log_error(
172       "failed to create Convolution operator with %.7g kernel scale: scale must be finite, normalized, and positive",
173       kernel_scale);
174     goto error;
175   }
176 
177   if (output_scale <= 0.0f || !isnormal(output_scale)) {
178     xnn_log_error(
179       "failed to create Convolution operator with %.7g output scale: scale must be finite, normalized, and positive",
180       output_scale);
181     goto error;
182   }
183 
184   if (output_min >= output_max) {
185     xnn_log_error(
186       "failed to create Convolution operator with [%" PRIu8 ", %" PRIu8 "] output range: "
187       "range min must be below range max",
188       output_min, output_max);
189     goto error;
190   }
191 
192   if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
193     xnn_log_error(
194       "failed to create Depthwise Convolution operator with %zu input channels per group: "
195       "Depthwise Convolution must have exactly 1 input channel per group",
196       group_input_channels);
197     goto error;
198   }
199 
200   const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
201   if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
202     if (any_padding) {
203       xnn_log_error(
204         "failed to create Convolution operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
205         "TensorFlow SAME padding can't be combined with explicit padding specification",
206         input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
207       goto error;
208     }
209   }
210 
211   status = xnn_status_unsupported_parameter;
212 
213   const float convolution_scale = input_scale * kernel_scale / output_scale;
214   if (convolution_scale >= 1.0f) {
215     xnn_log_error(
216       "failed to create Convolution operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
217       "convolution scale %.7g is greater or equal to 1.0",
218       input_scale, kernel_scale, output_scale, convolution_scale);
219     goto error;
220   }
221 
222   status = xnn_status_out_of_memory;
223 
224   convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
225   if (convolution_op == NULL) {
226     xnn_log_error("failed to allocate %zu bytes for Convolution operator descriptor", sizeof(struct xnn_operator));
227     goto error;
228   }
229 
230   const size_t kernel_size = kernel_height * kernel_width;
231 
232   enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_none;
233   const struct dwconv_parameters* dwconv_parameters = NULL;
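  // Micro-kernel selection: use a DWCONV micro-kernel when the convolution is
  // depthwise-shaped (one input and one output channel per group, more than one group)
  // and a micro-kernel for this kernel size exists; use plain GEMM for unpadded 1x1
  // stride-1 convolutions; otherwise fall back to IGEMM (indirect GEMM driven by an
  // indirection buffer).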
234   if (group_input_channels == 1 && group_output_channels == 1 && groups > 1 &&
235       (dwconv_parameters = find_dwigemm_ukernel(kernel_size, xnn_params.q8.dwconv, XNN_MAX_Q8_DWCONV_UKERNELS)) != NULL)
236   {
237     ukernel_type = xnn_ukernel_type_dwconv;
238   } else if (kernel_size == 1 && subsampling_height == 1 && subsampling_width == 1 && !any_padding) {
239     ukernel_type = xnn_ukernel_type_gemm;
240   } else {
241     ukernel_type = xnn_ukernel_type_igemm;
242   }
243 
244   size_t zero_size = 0;
245   switch (ukernel_type) {
246     case xnn_ukernel_type_dwconv:
247     {
248       assert(dwconv_parameters != NULL);
249       assert(dwconv_parameters->mr == kernel_size);
250 
251       const uint32_t c_stride = round_up_po2(groups, dwconv_parameters->cr);
252       const size_t packed_weights_size = (sizeof(uint8_t) * kernel_size + sizeof(int32_t)) * c_stride;
253       convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
254       if (convolution_op->packed_weights == NULL) {
255         xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
256         goto error;
257       }
258 
259       if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
260         xnn_pack_q8_dwconv_hwg_w(
261           kernel_height, kernel_width,
262           groups, dwconv_parameters->cr,
263           input_zero_point, kernel_zero_point,
264           kernel, bias, convolution_op->packed_weights);
265       } else {
266         xnn_pack_q8_dwconv_ghw_w(
267           kernel_height, kernel_width,
268           groups, dwconv_parameters->cr,
269           input_zero_point, kernel_zero_point,
270           kernel, bias, convolution_op->packed_weights);
271       }
272 
273       convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
274         .unipass_function = dwconv_parameters->up,
275         .mr = dwconv_parameters->mr,
276         .qr = dwconv_parameters->qr,
277       };
278 
279       zero_size = sizeof(uint8_t) * c_stride + XNN_EXTRA_BYTES;
280       break;
281     }
282     case xnn_ukernel_type_gemm:
283     case xnn_ukernel_type_igemm:
284     {
285       const uint32_t nr = xnn_params.q8.gemm.nr;
286       const uint32_t kr = UINT32_C(1) << xnn_params.q8.gemm.log2_kr;
287       const uint32_t n_stride = round_up(group_output_channels, nr);
288       const uint32_t k_stride = round_up_po2(group_input_channels, kr);
289 
290       const size_t packed_group_weights_size =
291         (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
292       convolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
293       if (convolution_op->packed_weights == NULL) {
294         xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
295         goto error;
296       }
297       memset(convolution_op->packed_weights, kernel_zero_point, packed_group_weights_size * groups);
298 
299       switch (ukernel_type) {
300         case xnn_ukernel_type_gemm:
301           xnn_pack_q8_gemm_goi_w(
302               groups, group_output_channels, group_input_channels,
303               nr, kr,
304               input_zero_point, kernel_zero_point,
305               kernel, bias, convolution_op->packed_weights);
306           convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
307             .mr = xnn_params.q8.gemm.mr,
308             .nr = nr,
309             .kr = kr,
310             .default_function = xnn_params.q8.gemm.gemm,
311           };
312           break;
313         case xnn_ukernel_type_igemm:
314           if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
315             xnn_pack_q8_conv_kgo_w(
316                 groups, group_output_channels, kernel_size,
317                 nr, kr,
318                 input_zero_point, kernel_zero_point,
319                 kernel, bias, convolution_op->packed_weights);
320           } else {
321             xnn_pack_q8_conv_goki_w(
322                 groups, group_output_channels, kernel_size, group_input_channels,
323                 nr, kr,
324                 input_zero_point, kernel_zero_point,
325                 kernel, bias, convolution_op->packed_weights);
326           }
327           convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
328             .mr = xnn_params.q8.gemm.mr,
329             .nr = nr,
330             .kr = kr,
331             .default_function = xnn_params.q8.gemm.igemm,
332           };
333           break;
334         default:
335           XNN_UNREACHABLE;
336       }
337 
338       zero_size = sizeof(uint8_t) * k_stride + XNN_EXTRA_BYTES;
339       break;
340     }
341     default:
342       XNN_UNREACHABLE;
343   }
344 
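  // Padded input locations are read indirectly from this zero buffer. For Q8 it is filled
  // with the input zero point, so padded elements contribute exact zeros after zero-point
  // subtraction. With TensorFlow SAME padding the buffer is needed for any non-1x1 kernel,
  // even though the explicit padding fields are still zero at this point.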
345   const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
346   if (any_padding || tf_same_padding) {
347     void* zero_buffer = xnn_allocate_simd_memory(zero_size);
348     if (zero_buffer == NULL) {
349       xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
350       goto error;
351     }
352     memset(zero_buffer, input_zero_point, zero_size);
353     convolution_op->zero_buffer = zero_buffer;
354   }
355 
356   convolution_op->padding_top = input_padding_top;
357   convolution_op->padding_right = input_padding_right;
358   convolution_op->padding_bottom = input_padding_bottom;
359   convolution_op->padding_left = input_padding_left;
360 
361   convolution_op->kernel_height = kernel_height;
362   convolution_op->kernel_width = kernel_width;
363   convolution_op->stride_height = subsampling_height;
364   convolution_op->stride_width = subsampling_width;
365   convolution_op->dilation_height = dilation_height;
366   convolution_op->dilation_width = dilation_width;
367   convolution_op->groups = groups;
368   convolution_op->group_input_channels = group_input_channels;
369   convolution_op->group_output_channels = group_output_channels;
370   convolution_op->input_pixel_stride = input_pixel_stride;
371   convolution_op->output_pixel_stride = output_pixel_stride;
372 
373   convolution_op->kernel_zero_point = kernel_zero_point;
374 
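  // Precompute the fixed-point requantization parameters. The combined scale
  // input_scale * kernel_scale / output_scale was verified above to be positive and
  // strictly below 1.0, as required by the Q8 micro-kernels' requantization scheme.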
375   convolution_op->q8_gemm_params =
376     xnn_init_q8_gemm_params(
377       input_zero_point, kernel_zero_point,
378       convolution_scale, output_zero_point, output_min, output_max);
379 
380   convolution_op->type = xnn_operator_type_convolution_nhwc_q8;
381   convolution_op->ukernel.type = ukernel_type;
382   if (tf_same_padding) {
383     convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
384   }
385 
386   convolution_op->state = xnn_run_state_invalid;
387 
388   *convolution_op_out = convolution_op;
389   return xnn_status_success;
390 
391 error:
392   xnn_delete_operator(convolution_op);
393   return status;
394 }
395 
396 enum xnn_status xnn_create_convolution2d_nhwc_f32(
397     uint32_t input_padding_top,
398     uint32_t input_padding_right,
399     uint32_t input_padding_bottom,
400     uint32_t input_padding_left,
401     uint32_t kernel_height,
402     uint32_t kernel_width,
403     uint32_t subsampling_height,
404     uint32_t subsampling_width,
405     uint32_t dilation_height,
406     uint32_t dilation_width,
407     uint32_t groups,
408     size_t group_input_channels,
409     size_t group_output_channels,
410     size_t input_pixel_stride,
411     size_t output_pixel_stride,
412     const float* kernel,
413     const float* bias,
414     float output_min,
415     float output_max,
416     uint32_t flags,
417     xnn_operator_t* convolution_op_out)
418 {
419   xnn_operator_t convolution_op = NULL;
420   enum xnn_status status = xnn_status_uninitialized;
421 
422   if (!xnn_params.initialized) {
423     xnn_log_error("failed to create Convolution operator: XNNPACK is not initialized");
424     goto error;
425   }
426 
427   status = xnn_status_invalid_parameter;
428 
429   if (kernel_width == 0 || kernel_height == 0) {
430     xnn_log_error(
431       "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
432       kernel_width, kernel_height);
433     goto error;
434   }
435 
436   if (subsampling_width == 0 || subsampling_height == 0) {
437     xnn_log_error(
438       "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " subsampling: "
439       "subsampling dimensions must be non-zero",
440       subsampling_width, subsampling_height);
441     goto error;
442   }
443 
444   if (dilation_width == 0 || dilation_height == 0) {
445     xnn_log_error(
446       "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
447       "dilation dimensions must be non-zero",
448       dilation_width, dilation_height);
449     goto error;
450   }
451 
452   if (groups == 0) {
453     xnn_log_error(
454       "failed to create Convolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
455     goto error;
456   }
457 
458   if (group_input_channels == 0) {
459     xnn_log_error(
460       "failed to create Convolution operator with %zu input channels per group: "
461       "number of channels must be non-zero",
462       group_input_channels);
463     goto error;
464   }
465 
466   if (group_output_channels == 0) {
467     xnn_log_error(
468       "failed to create Convolution operator with %zu output channels per group: "
469       "number of channels must be non-zero",
470       group_output_channels);
471     goto error;
472   }
473 
474   const size_t input_channels = groups * group_input_channels;
475   if (input_pixel_stride < input_channels) {
476     xnn_log_error(
477       "failed to create Convolution operator with input pixel stride of %zu: "
478       "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
479       input_pixel_stride, groups, group_input_channels);
480     goto error;
481   }
482 
483   const size_t output_channels = groups * group_output_channels;
484   if (output_pixel_stride < output_channels) {
485     xnn_log_error(
486       "failed to create Convolution operator with output pixel stride of %zu: "
487       "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
488       output_pixel_stride, groups, group_output_channels);
489     goto error;
490   }
491 
492   if (isnan(output_min)) {
493     xnn_log_error(
494       "failed to create Convolution operator with NaN output lower bound: lower bound must be non-NaN");
495     goto error;
496   }
497 
498   if (isnan(output_max)) {
499     xnn_log_error(
500       "failed to create Convolution operator with NaN output upper bound: upper bound must be non-NaN");
501     goto error;
502   }
503 
504   if (output_min >= output_max) {
505     xnn_log_error(
506       "failed to create Convolution operator with [%.7g, %.7g] output range: "
507       "lower bound must be below upper bound",
508       output_min, output_max);
509     goto error;
510   }
511 
512   if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
513     xnn_log_error(
514       "failed to create Depthwise Convolution operator with %zu input channels per group: "
515       "Depthwise Convolution must have exactly 1 input channel per group",
516       group_input_channels);
517     goto error;
518   }
519 
520   const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
521   if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
522     if (any_padding) {
523       xnn_log_error(
524         "failed to create Convolution operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
525         "TensorFlow SAME padding can't be combined with explicit padding specification",
526         input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
527       goto error;
528     }
529   }
530 
531   status = xnn_status_out_of_memory;
532 
533   convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
534   if (convolution_op == NULL) {
535     xnn_log_error("failed to allocate %zu bytes for Convolution operator descriptor", sizeof(struct xnn_operator));
536     goto error;
537   }
538 
539   const size_t kernel_size = kernel_height * kernel_width;
540 
541   enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_none;
542   const struct dwconv_parameters* dwconv_parameters = NULL;
543   const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
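  // Micro-kernel selection mirrors the Q8 path, with one extra case: an unpadded 1x1
  // stride-1 depthwise-shaped convolution degenerates into a per-channel multiply-add
  // and is handled by the VMULCADDC micro-kernel.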
544   if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding) {
545     ukernel_type = xnn_ukernel_type_vmulcaddc;
546   } else if (group_input_channels == 1 && group_output_channels == 1 && (dwconv_parameters =
547                find_dwigemm_ukernel(kernel_size, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS)) != NULL)
548   {
549     ukernel_type = xnn_ukernel_type_dwconv;
550   } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
551     ukernel_type = xnn_ukernel_type_gemm;
552   } else {
553     ukernel_type = xnn_ukernel_type_igemm;
554   }
555 
556   size_t zero_size = 0;
557   switch (ukernel_type) {
558     case xnn_ukernel_type_vmulcaddc:
559     {
560       const uint32_t c_stride = round_up_po2(groups, xnn_params.f32.vmulcaddc.channel_tile);
561       const size_t packed_weights_size = 2 * sizeof(float) * c_stride;
562       convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
563       if (convolution_op->packed_weights == NULL) {
564         xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
565         goto error;
566       }
567 
568       xnn_pack_f32_vmulcaddc_w(
569         groups, xnn_params.f32.vmulcaddc.channel_tile,
570         kernel, bias, convolution_op->packed_weights);
571 
572       convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
573         .function = xnn_params.f32.vmulcaddc.ukernel,
574         .mr = xnn_params.f32.vmulcaddc.row_tile,
575       };
576       break;
577     }
578     case xnn_ukernel_type_dwconv:
579     {
580       assert(dwconv_parameters != NULL);
581       assert(dwconv_parameters->mr == kernel_size);
582 
583       const uint32_t c_stride = round_up_po2(groups, dwconv_parameters->cr);
584       const size_t packed_weights_size = (kernel_size + 1) * sizeof(float) * c_stride;
585       convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
586       if (convolution_op->packed_weights == NULL) {
587         xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
588         goto error;
589       }
590 
591       if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
592         xnn_pack_f32_dwconv_hwg_w(
593           kernel_height, kernel_width,
594           groups, dwconv_parameters->cr,
595           kernel, bias, convolution_op->packed_weights);
596       } else {
597         xnn_pack_f32_dwconv_ghw_w(
598           kernel_height, kernel_width,
599           groups, dwconv_parameters->cr,
600           kernel, bias, convolution_op->packed_weights);
601       }
602 
603       convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
604         .unipass_function = dwconv_parameters->up,
605         .mr = dwconv_parameters->mr,
606         .qr = dwconv_parameters->qr,
607       };
608 
609       zero_size = sizeof(float) * c_stride;
610       break;
611     }
612     case xnn_ukernel_type_gemm:
613     case xnn_ukernel_type_igemm:
614     {
615       const uint32_t nr = xnn_params.f32.gemm.nr;
616       const uint32_t kr = UINT32_C(1) << xnn_params.f32.gemm.log2_kr;
617       const uint32_t sr = UINT32_C(1) << xnn_params.f32.gemm.log2_sr;
618       const uint32_t n_stride = round_up(group_output_channels, nr);
619       const uint32_t k_stride = round_up_po2(group_input_channels, kr);
620 
621       const size_t packed_group_weights_size = (kernel_size * k_stride + 1) * sizeof(float) * n_stride;
622       convolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
623       if (convolution_op->packed_weights == NULL) {
624         xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
625         goto error;
626       }
627       memset(convolution_op->packed_weights, 0, packed_group_weights_size * groups);
628 
629       switch (ukernel_type) {
630         case xnn_ukernel_type_gemm:
631           xnn_pack_f32_gemm_goi_w(
632               groups, group_output_channels, group_input_channels,
633               nr, kr, sr,
634               kernel, bias, convolution_op->packed_weights);
635           convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
636             .mr = xnn_params.f32.gemm.mr,
637             .nr = nr,
638             .kr = kr,
639             .default_function = xnn_params.f32.gemm.gemm,
640             .mr1_function = xnn_params.f32.gemm.gemm1,
641           };
642           break;
643         case xnn_ukernel_type_igemm:
644           if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
645             xnn_pack_f32_conv_kgo_w(
646               groups, group_output_channels, kernel_size,
647               nr, kr,
648               kernel, bias, convolution_op->packed_weights);
649           } else {
650             xnn_pack_f32_conv_goki_w(
651               groups, group_output_channels, kernel_size, group_input_channels,
652               nr, kr, sr,
653               kernel, bias, convolution_op->packed_weights);
654           }
655           convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
656             .mr = xnn_params.f32.gemm.mr,
657             .nr = nr,
658             .kr = kr,
659             .default_function = xnn_params.f32.gemm.igemm,
660             .mr1_function = xnn_params.f32.gemm.igemm1,
661           };
662           break;
663         default:
664           XNN_UNREACHABLE;
665       }
666 
667       zero_size = sizeof(float) * k_stride;
668       break;
669     }
670     default:
671       XNN_UNREACHABLE;
672   }
673 
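  // Same zero-padding buffer as in the Q8 path, except that for F32 it only needs to be
  // zero-initialized, which xnn_allocate_zero_simd_memory already guarantees.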
674   const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
675   if (any_padding || tf_same_padding) {
676     void* zero_buffer = xnn_allocate_zero_simd_memory(zero_size);
677     if (zero_buffer == NULL) {
678       xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
679       goto error;
680     }
681     convolution_op->zero_buffer = zero_buffer;
682   }
683 
684   convolution_op->padding_top = input_padding_top;
685   convolution_op->padding_right = input_padding_right;
686   convolution_op->padding_bottom = input_padding_bottom;
687   convolution_op->padding_left = input_padding_left;
688 
689   convolution_op->kernel_height = kernel_height;
690   convolution_op->kernel_width = kernel_width;
691   convolution_op->stride_height = subsampling_height;
692   convolution_op->stride_width = subsampling_width;
693   convolution_op->dilation_height = dilation_height;
694   convolution_op->dilation_width = dilation_width;
695   convolution_op->groups = groups;
696   convolution_op->group_input_channels = group_input_channels;
697   convolution_op->group_output_channels = group_output_channels;
698   convolution_op->input_pixel_stride = input_pixel_stride;
699   convolution_op->output_pixel_stride = output_pixel_stride;
700 
701   convolution_op->f32_output_params = xnn_init_f32_output_params(output_min, output_max);
702 
703   convolution_op->type = xnn_operator_type_convolution_nhwc_f32;
704   convolution_op->ukernel.type = ukernel_type;
705   if (tf_same_padding) {
706     convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
707   }
708 
709   convolution_op->state = xnn_run_state_invalid;
710 
711   *convolution_op_out = convolution_op;
712   return xnn_status_success;
713 
714 error:
715   xnn_delete_operator(convolution_op);
716   return status;
717 }
718 
719 static enum xnn_status setup_convolution2d_nhwc(
720   xnn_operator_t convolution_op,
721   size_t batch_size,
722   size_t input_height,
723   size_t input_width,
724   const void* input,
725   void* output,
726   uint32_t log2_input_element_size,
727   uint32_t log2_filter_element_size,
728   uint32_t bias_element_size,
729   uint32_t log2_output_element_size,
730   const void* params,
731   size_t num_threads)
732 {
733   convolution_op->state = xnn_run_state_invalid;
734 
735   if (!xnn_params.initialized) {
736     xnn_log_error("failed to setup Convolution operator: XNNPACK is not initialized");
737     return xnn_status_uninitialized;
738   }
739 
740   if (input_width == 0 || input_height == 0) {
741     xnn_log_error(
742       "failed to setup Convolution operator with %zux%zu input: input dimensions must be non-zero",
743       input_width, input_height);
744     return xnn_status_invalid_parameter;
745   }
746 
747   if (batch_size == 0) {
748     convolution_op->state = xnn_run_state_skip;
749     return xnn_status_success;
750   }
751 
752   convolution_op->batch_size = batch_size;
753   convolution_op->input_height = input_height;
754   convolution_op->input_width = input_width;
755   convolution_op->input = input;
756 
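  // With TensorFlow SAME padding the output size is fixed first, and the total padding
  //   (output - 1) * stride + effective_kernel - input
  // is then split per the TensorFlow convention: the smaller half goes to the top/left,
  // the remainder to the bottom/right.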
757   if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
758     convolution_op->output_height = compute_output_dimension_with_tf_same_padding(
759         input_height, convolution_op->stride_height);
760     convolution_op->output_width = compute_output_dimension_with_tf_same_padding(
761         input_width, convolution_op->stride_width);
762 
763     const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1;
764     const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1;
765     const uint32_t total_padding_height =
766       (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height;
767     const uint32_t total_padding_width =
768       (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width;
769     convolution_op->padding_top = total_padding_height / 2;
770     convolution_op->padding_left = total_padding_width / 2;
771     convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top;
772     convolution_op->padding_right = total_padding_width - convolution_op->padding_left;
773   } else {
774     convolution_op->output_height = compute_output_dimension(
775         convolution_op->padding_top + input_height + convolution_op->padding_bottom,
776         convolution_op->kernel_height,
777         convolution_op->dilation_height,
778         convolution_op->stride_height);
779     convolution_op->output_width = compute_output_dimension(
780         convolution_op->padding_left + input_width + convolution_op->padding_right,
781         convolution_op->kernel_width,
782         convolution_op->dilation_width,
783         convolution_op->stride_width);
784   }
785   convolution_op->output = output;
786 
787   switch (convolution_op->ukernel.type) {
788     case xnn_ukernel_type_gemm:
789     {
790       // Convolution maps directly to GEMM and doesn't use indirection buffer.
791 
792       const size_t output_height = convolution_op->output_height;
793       const size_t output_width = convolution_op->output_width;
794       const size_t output_size = output_height * output_width;
795       const size_t batch_output_size = batch_size * output_size;
796 
797       const size_t groups = convolution_op->groups;
798       const size_t group_input_channels = convolution_op->group_input_channels;
799       const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr) << log2_filter_element_size) + bias_element_size;
800       const size_t group_output_channels = convolution_op->group_output_channels;
801 
802       uint32_t mr = convolution_op->ukernel.gemm.mr;
803       const uint32_t nr = convolution_op->ukernel.gemm.nr;
804       xnn_gemm_ukernel_function gemm_ukernel = convolution_op->ukernel.gemm.default_function;
805       if (batch_output_size == 1 && convolution_op->ukernel.gemm.mr1_function != NULL) {
806         mr = 1;
807         gemm_ukernel = convolution_op->ukernel.gemm.mr1_function;
808       }
809 
810       convolution_op->context.gemm = (struct gemm_context) {
811           .k_scaled = group_input_channels << log2_input_element_size,
812           .a = input,
813           .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
814           .packed_w = convolution_op->packed_weights,
815           .w_stride = w_stride,
816           .wg_stride = w_stride * round_up(group_output_channels, nr),
817           .c = output,
818           .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
819           .cn_stride = nr << log2_output_element_size,
820           .cg_stride = group_output_channels << log2_output_element_size,
821           .log2_csize = log2_output_element_size,
822           .ukernel = gemm_ukernel,
823       };
824       memcpy(&convolution_op->context.gemm.params, params, sizeof(convolution_op->context.gemm.params));
825 
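      // On multiple threads, shrink the N (output-channel) tile, in multiples of nr, so
      // that the work splits into enough tiles to aim for about target_tiles_per_thread
      // tiles per thread.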
826       size_t nc = group_output_channels;
827       if (num_threads > 1) {
828         const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
829         const size_t target_tiles_per_thread = 5;
830         const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
831         if (max_nc < nc) {
832           nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
833         }
834       }
835       if (groups == 1) {
836         convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
837         convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
838         convolution_op->compute.range[0] = batch_output_size;
839         convolution_op->compute.range[1] = group_output_channels;
840         convolution_op->compute.tile[0] = mr;
841         convolution_op->compute.tile[1] = nc;
842       } else {
843         convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
844         convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_ggemm;
845         convolution_op->compute.range[0] = groups;
846         convolution_op->compute.range[1] = batch_output_size;
847         convolution_op->compute.range[2] = group_output_channels;
848         convolution_op->compute.tile[0] = mr;
849         convolution_op->compute.tile[1] = nc;
850       }
851       convolution_op->state = xnn_run_state_ready;
852 
853       return xnn_status_success;
854     }
855     case xnn_ukernel_type_igemm:
856     {
857       const size_t groups = convolution_op->groups;
858       const size_t kernel_height = convolution_op->kernel_height;
859       const size_t kernel_width = convolution_op->kernel_width;
860       const size_t kernel_size = kernel_height * kernel_width;
861       const size_t output_height = convolution_op->output_height;
862       const size_t output_width = convolution_op->output_width;
863       const size_t output_size = output_height * output_width;
864 
865       uint32_t mr = convolution_op->ukernel.igemm.mr;
866       const uint32_t nr = convolution_op->ukernel.igemm.nr;
867       xnn_igemm_ukernel_function igemm_ukernel = convolution_op->ukernel.igemm.default_function;
868       if (output_size == 1 && convolution_op->ukernel.igemm.mr1_function != NULL) {
869         mr = 1;
870         igemm_ukernel = convolution_op->ukernel.igemm.mr1_function;
871       }
872 
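      // The indirection buffer holds one input-pixel pointer per (output pixel, kernel
      // element) pair, with the output size rounded up to the mr tile; entries that fall
      // into the padding region point at zero_buffer. It is rebuilt only when the input
      // height or width changes.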
873       const size_t tiled_output_size = round_up(output_size, mr);
874       const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;
875 
876       if (input_height != convolution_op->last_input_height ||
877           input_width != convolution_op->last_input_width)
878       {
879         const void** indirection_buffer = (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size);
880         if (indirection_buffer == NULL) {
881           xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
882           return xnn_status_out_of_memory;
883         }
884         convolution_op->indirection_buffer = indirection_buffer;
885         convolution_op->last_input = input;
886         convolution_op->last_input_height = input_height;
887         convolution_op->last_input_width = input_width;
888 
889         xnn_indirection_init_conv2d(convolution_op, mr, log2_input_element_size);
890       }
891 
892       const size_t group_input_channels = convolution_op->group_input_channels;
893       const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size) + bias_element_size;
894       const size_t group_output_channels = convolution_op->group_output_channels;
895       convolution_op->context.igemm = (struct igemm_context) {
896           .ks = kernel_size,
897           .ks_scaled = kernel_size * mr * sizeof(void*),
898           .kc = group_input_channels << log2_input_element_size,
899           .w_stride = w_stride,
900           .indirect_a = convolution_op->indirection_buffer,
901           .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
902           .zero = convolution_op->zero_buffer,
903           .packed_w = convolution_op->packed_weights,
904           .c = convolution_op->output,
905           .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
906           .cn_stride = nr << log2_output_element_size,
907           .ga_stride = group_input_channels << log2_input_element_size,
908           .gw_stride = w_stride * round_up(group_output_channels, nr),
909           .gc_stride = group_output_channels << log2_output_element_size,
910           .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
911           .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
912           .log2_csize = log2_output_element_size,
913           .ukernel = igemm_ukernel,
914       };
915       memcpy(&convolution_op->context.igemm.params, params, sizeof(convolution_op->context.igemm.params));
916 
917       size_t nc = group_output_channels;
918       if (num_threads > 1) {
919         const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
920         const size_t target_tiles_per_thread = 5;
921         const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
922         if (max_nc < nc) {
923           nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
924         }
925       }
926       if (groups == 1) {
927         convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
928         convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
929         convolution_op->compute.range[0] = batch_size;
930         convolution_op->compute.range[1] = output_size;
931         convolution_op->compute.range[2] = group_output_channels;
932         convolution_op->compute.tile[0] = mr;
933         convolution_op->compute.tile[1] = nc;
934       } else {
935         convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
936         convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_gigemm;
937         convolution_op->compute.range[0] = batch_size;
938         convolution_op->compute.range[1] = groups;
939         convolution_op->compute.range[2] = output_size;
940         convolution_op->compute.range[3] = group_output_channels;
941         convolution_op->compute.tile[0] = mr;
942         convolution_op->compute.tile[1] = nc;
943       }
944       convolution_op->state = xnn_run_state_ready;
945 
946       return xnn_status_success;
947     }
948     case xnn_ukernel_type_dwconv:
949     {
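      // Fast path: if the same input pointer and spatial size were seen before, the
      // indirection buffer entries for up to valid_batch_size batches are still valid,
      // so only the output pointer and the parallelization range need refreshing.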
950       size_t valid_batch_size = 0;
951       if (input == convolution_op->last_input &&
952           input_height == convolution_op->last_input_height &&
953           input_width == convolution_op->last_input_width)
954       {
955         valid_batch_size = convolution_op->valid_batch_size;
956         if (batch_size <= valid_batch_size) {
957           convolution_op->compute.range[0] = batch_size * convolution_op->output_height;
958           convolution_op->context.dwconv.output = output;
959           convolution_op->state = xnn_run_state_ready;
960           return xnn_status_success;
961         }
962       }
963 
964       const size_t kernel_height = convolution_op->kernel_height;
965       const size_t kernel_width = convolution_op->kernel_width;
966       const size_t kernel_size = kernel_height * kernel_width;
967       const size_t output_height = convolution_op->output_height;
968       const size_t output_width = convolution_op->output_width;
969       const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
970       const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
971       const size_t indirection_buffer_size = sizeof(void*) * batch_size * output_height * step_height;
972 
973       const void** indirection_buffer =
974         (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size);
975       if (indirection_buffer == NULL) {
976         xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
977         return xnn_status_out_of_memory;
978       }
979       convolution_op->indirection_buffer = indirection_buffer;
980 
981       xnn_indirection_init_dwconv2d(convolution_op, valid_batch_size, step_height, step_width, log2_input_element_size);
982 
983       const size_t groups = convolution_op->groups;
984       convolution_op->context.dwconv = (struct dwconv_context) {
985           .groups = groups,
986           .indirection_buffer = convolution_op->indirection_buffer,
987           .indirection_buffer_row_stride = step_height,
988           .indirection_buffer_col_stride = kernel_height * step_width * sizeof(void*),
989           .packed_weights = convolution_op->packed_weights,
990           .output = convolution_op->output,
991           .output_width = output_width,
992           .output_row_stride = output_width * convolution_op->output_pixel_stride << log2_output_element_size,
993           .output_col_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
994           .unipass_ukernel = convolution_op->ukernel.dwconv.unipass_function,
995       };
996       memcpy(&convolution_op->context.dwconv.params, params, sizeof(convolution_op->context.dwconv.params));
997 
998       convolution_op->compute.type = xnn_parallelization_type_1d;
999       convolution_op->compute.task_1d = (pthreadpool_task_1d_t) xnn_compute_dwconv_unipass;
1000       convolution_op->compute.range[0] = batch_size * output_height;
1001       convolution_op->state = xnn_run_state_ready;
1002 
1003       convolution_op->last_input = input;
1004       convolution_op->last_input_height = input_height;
1005       convolution_op->last_input_width = input_width;
1006       convolution_op->valid_batch_size = max(valid_batch_size, batch_size);
1007 
1008       return xnn_status_success;
1009     }
1010     case xnn_ukernel_type_vmulcaddc:
1011     {
1012       const size_t batch_output_size = batch_size * convolution_op->output_height * convolution_op->output_width;
1013 
1014       convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
1015           .n = convolution_op->groups << log2_input_element_size,
1016           .x = input,
1017           .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1018           .w = convolution_op->packed_weights,
1019           .y = output,
1020           .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1021           .ukernel = convolution_op->ukernel.vmulcaddc.function,
1022       };
1023       memcpy(&convolution_op->context.vmulcaddc.params, params, sizeof(convolution_op->context.vmulcaddc.params));
1024 
1025       size_t mc = batch_output_size;
1026       if (num_threads > 1) {
1027         const size_t target_tiles_per_thread = 5;
1028         const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
1029         if (max_mc < mc) {
1030           const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
1031           mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
1032         }
1033       }
1034       convolution_op->compute.type = xnn_parallelization_type_1d_tile_1d;
1035       convolution_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
1036       convolution_op->compute.range[0] = batch_output_size;
1037       convolution_op->compute.tile[0] = mc;
1038       convolution_op->state = xnn_run_state_ready;
1039 
1040       return xnn_status_success;
1041     }
1042     default:
1043       XNN_UNREACHABLE;
1044   }
1045 }
1046 
1047 enum xnn_status xnn_setup_convolution2d_nhwc_q8(
1048     xnn_operator_t convolution_op,
1049     size_t batch_size,
1050     size_t input_height,
1051     size_t input_width,
1052     const uint8_t* input,
1053     uint8_t* output,
1054     pthreadpool_t threadpool)
1055 {
1056   if (convolution_op->type != xnn_operator_type_convolution_nhwc_q8) {
1057     xnn_log_error("failed to setup Convolution (NHWC, Q8) operator: operator type mismatch");
1058     return xnn_status_invalid_parameter;
1059   }
1060 
1061   return setup_convolution2d_nhwc(
1062     convolution_op,
1063     batch_size, input_height, input_width,
1064     input, output,
1065     0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
1066     0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
1067     sizeof(int32_t) /* sizeof(bias element) */,
1068     0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
1069     &convolution_op->q8_gemm_params,
1070     pthreadpool_get_threads_count(threadpool));
1071 }
1072 
1073 enum xnn_status xnn_setup_convolution2d_nhwc_f32(
1074     xnn_operator_t convolution_op,
1075     size_t batch_size,
1076     size_t input_height,
1077     size_t input_width,
1078     const float* input,
1079     float* output,
1080     pthreadpool_t threadpool)
1081 {
1082   if (convolution_op->type != xnn_operator_type_convolution_nhwc_f32) {
1083     xnn_log_error("failed to setup Convolution (NHWC, F32) operator: operator type mismatch");
1084     return xnn_status_invalid_parameter;
1085   }
1086 
1087   return setup_convolution2d_nhwc(
1088     convolution_op,
1089     batch_size, input_height, input_width,
1090     input, output,
1091     2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
1092     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
1093     sizeof(float) /* sizeof(bias element) */,
1094     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
1095     &convolution_op->f32_output_params,
1096     pthreadpool_get_threads_count(threadpool));
1097 }
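/*
 * Minimal usage sketch (illustrative only, not part of this file): the create/setup
 * functions above pair with the generic run/delete entry points declared in <xnnpack.h>.
 * Exact signatures (e.g. of xnn_initialize) vary between XNNPACK versions, so treat the
 * calls below as assumptions rather than a verbatim recipe:
 *
 *   xnn_initialize(...);                                   // once per process
 *   xnn_operator_t conv = NULL;
 *   xnn_create_convolution2d_nhwc_f32(..., &conv);         // validate args, pack weights
 *   xnn_setup_convolution2d_nhwc_f32(conv, batch_size,
 *       input_height, input_width, input, output,
 *       threadpool);                                       // bind shapes and pointers
 *   xnn_run_operator(conv, threadpool);                    // execute
 *   xnn_delete_operator(conv);                             // free packed weights/buffers
 */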
1098