// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <inttypes.h>  // PRIu32/PRId8 format specifiers used in log messages
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/cache.h>
#include <xnnpack/common.h>
#include <xnnpack/compute.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params.h>
#include <xnnpack/post-operation.h>
#include <xnnpack/microparams-init.h>

#ifndef XNN_ENABLE_GEMM_M_SPECIALIZATION
#error "XNN_ENABLE_GEMM_M_SPECIALIZATION is not defined"
#endif

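// With TensorFlow SAME padding, the output extent depends only on the input
// extent and the stride: output = ceil(input / subsampling).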
static inline size_t compute_output_dimension_with_tf_same_padding(
  size_t input_dimension,
  size_t subsampling_dimension)
{
  return divide_round_up(input_dimension, subsampling_dimension);
}

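// Linearly searches the table of DWCONV microkernel descriptors for one whose
// primary tile matches the convolution kernel size; returns NULL if no
// unipass microkernel covers this kernel size.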
static inline const struct dwconv_parameters* find_dwconv_ukernel(
  size_t kernel_size,
  const struct dwconv_parameters* ukernel,
  size_t num_ukernels)
{
  while (num_ukernels-- != 0) {
    if (ukernel->primary_tile == kernel_size) {
      return ukernel;
    }
    ukernel++;
  }
  return NULL;
}

#if XNN_PLATFORM_JIT
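// Translates an offset into the JIT code cache into an absolute code address.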
static inline uintptr_t cached_code_at_offset(xnn_operator_t op, size_t offset)
{
  return (uintptr_t) op->code_cache->cache.code.start + offset;
}

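// JIT-generates a GEMM microkernel for the given MR and inserts it into the
// code cache. Returns the offset of the generated code within the cache, or
// XNN_CACHE_NOT_FOUND if no generator is available or generation fails.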
static size_t get_generated_gemm(
  struct xnn_hmp_gemm_codegen generators,
  struct jit_gemm_params *jit_gemm_params,
  size_t mr,
  size_t group_output_channels,
  size_t nr,
  size_t group_input_channels,
  size_t log2_input_element_size,
  struct xnn_code_cache* code_cache)
{
  size_t offset = XNN_CACHE_NOT_FOUND;
  xnn_jit_gemm_code_generator_function generator = generators.function[XNN_UARCH_DEFAULT];
  if (generator == NULL) {
    goto error;
  }

  enum xnn_status status = xnn_status_success;

  status = xnn_reserve_code_memory(&code_cache->cache.code, XNN_DEFAULT_MICROKERNEL_SIZE);
  if (xnn_status_success != status) {
    xnn_log_error("failed to ensure sufficient space in the code buffer for a microkernel");
    goto error;
  }

  const size_t old_size = code_cache->cache.code.size;
  void* old_code = (uint8_t*) code_cache->cache.code.start + old_size;
  status = generator(&code_cache->cache.code, mr, group_output_channels % nr,
                     group_input_channels << log2_input_element_size,
                     jit_gemm_params);

  if (xnn_status_success != status) {
    xnn_log_error("failed to generate GEMM microkernel");
    goto error;
  }

  const size_t new_size = code_cache->cache.code.size;
  return xnn_get_or_insert_code_cache(code_cache, old_code, new_size - old_size);

error:
  return offset;
}

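// Pre-generates JIT GEMM microkernels for every MR from 1 up to max_mr and
// records their code-cache offsets in the operator; no-op when the operator
// has no code cache attached.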
static void generate_gemms_up_to_max_mr(
  size_t max_mr,
  struct gemm_codegens generators,
  struct jit_gemm_params *jit_gemm_params,
  size_t group_output_channels,
  size_t nr,
  size_t group_input_channels,
  size_t log2_input_element_size,
  xnn_operator_t convolution_op)
{
  assert(XNN_MAX_MR >= max_mr);
  if (convolution_op->code_cache == NULL) {
    return;
  }
  convolution_op->ukernel.gemm.gemm_cases[0].generated_code_offset[XNN_UARCH_DEFAULT] =
    get_generated_gemm(generators.gemm1, jit_gemm_params, 1, group_output_channels, nr, group_input_channels,
                       log2_input_element_size, convolution_op->code_cache);
  for (size_t mr = 2; mr <= max_mr; mr++) {
    convolution_op->ukernel.gemm.gemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT] =
      get_generated_gemm(generators.gemm, jit_gemm_params, mr, group_output_channels, nr, group_input_channels,
                         log2_input_element_size, convolution_op->code_cache);
  }
}

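// JIT-generates an IGEMM (indirect GEMM) microkernel for the given MR and
// inserts it into the code cache. Returns the code-cache offset, or
// XNN_CACHE_NOT_FOUND if no generator is available or generation fails.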
static size_t get_generated_igemm(
  struct xnn_hmp_igemm_codegen generators,
  struct jit_gemm_params *jit_gemm_params,
  size_t group_output_channels,
  size_t nr,
  size_t group_input_channels,
  size_t log2_input_element_size,
  size_t kernel_size,
  size_t mr,
  struct xnn_code_cache* code_cache)
{
  size_t offset = XNN_CACHE_NOT_FOUND;
  xnn_jit_igemm_code_generator_function generator = generators.function[XNN_UARCH_DEFAULT];
  if (generator == NULL) {
    goto error;
  }
  enum xnn_status status = xnn_status_success;

  status = xnn_reserve_code_memory(&code_cache->cache.code, XNN_DEFAULT_MICROKERNEL_SIZE);
  if (xnn_status_success != status) {
    xnn_log_error("failed to ensure sufficient space in code buffer for microkernel");
    goto error;
  }

  const size_t old_size = code_cache->cache.code.size;
  void* old_code = (uint8_t*) code_cache->cache.code.start + old_size;
  status = generator(&code_cache->cache.code, mr, group_output_channels % nr,
                     group_input_channels << log2_input_element_size,
                     kernel_size * mr * sizeof(void*), jit_gemm_params);
  if (status != xnn_status_success) {
    xnn_log_error("failed to generate IGEMM microkernel");
    goto error;
  }

  const size_t new_size = code_cache->cache.code.size;
  return xnn_get_or_insert_code_cache(code_cache, old_code, new_size - old_size);

error:
  return offset;
}

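// Pre-generates JIT IGEMM microkernels for every MR from 1 up to max_mr and
// records their code-cache offsets in the operator; no-op when the operator
// has no code cache attached.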
static void generate_igemms_up_to_max_mr(
  size_t max_mr,
  struct gemm_codegens generators,
  struct jit_gemm_params *jit_gemm_params,
  size_t group_output_channels,
  size_t nr,
  size_t group_input_channels,
  size_t log2_input_element_size,
  size_t kernel_size,
  xnn_operator_t convolution_op)
{
  assert(XNN_MAX_MR >= max_mr);
  if (convolution_op->code_cache == NULL) {
    return;
  }
  convolution_op->ukernel.igemm.igemm_cases[0].generated_code_offset[XNN_UARCH_DEFAULT] =
    get_generated_igemm(generators.igemm1, jit_gemm_params, group_output_channels, nr, group_input_channels,
                        log2_input_element_size, kernel_size, 1, convolution_op->code_cache);
  for (size_t mr = 2; mr <= max_mr; mr++) {
    convolution_op->ukernel.igemm.igemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT] =
      get_generated_igemm(generators.igemm, jit_gemm_params, group_output_channels, nr, group_input_channels,
                          log2_input_element_size, kernel_size, mr, convolution_op->code_cache);
  }
}
#endif  // XNN_PLATFORM_JIT

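// Shared implementation behind all xnn_create_convolution2d_nhwc_* entry
// points: validates parameters, selects a microkernel type (VMULCADDC,
// DWCONV, GEMM, or IGEMM), packs the weights, and initializes the operator
// descriptor. Datatype-specific behavior is injected through the packing
// functions, packed microkernel parameters, and padding bytes.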
static enum xnn_status create_convolution2d_nhwc(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const void* kernel,
  const void* bias,
  uint32_t flags,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  xnn_pack_vmulcaddc_w_function pack_vmulcaddc_w,
  xnn_pack_dwconv_hwg_w_function pack_dwconv_hwg_w,
  xnn_pack_dwconv_ghw_w_function pack_dwconv_ghw_w,
  xnn_pack_gemm_goi_w_function pack_gemm_goi_w,
  xnn_pack_conv_kgo_w_function pack_conv_kgo_w,
  xnn_pack_conv_goki_w_function pack_conv_goki_w,
  const void* packing_params,
  int input_padding_byte,
  int packed_weights_padding_byte,
  size_t extra_weights_bytes,
  xnn_init_qc8_scale_params_fn init_scale_params,
  const float* scale_params,
  const void* gemm_params,
  size_t gemm_params_size,
  const void* dwconv_params,
  size_t dwconv_params_size,
  const void* vmulcaddc_params,
  size_t vmulcaddc_params_size,
  const struct gemm_parameters* gemm_parameters,
  const struct dwconv_parameters* dwconv_ukernel,
  const struct vmulcaddc_parameters* vmulcaddc_parameters,
  struct jit_gemm_params* jit_gemm_params,
  bool linear_activation,
  bool relu_activation,
  uint32_t datatype_init_flags,
  enum xnn_operator_type operator_type,
  size_t num_post_operations,
  void* post_operation_params,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out)
{
  xnn_operator_t convolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error(
      "failed to create %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_unsupported_hardware;

  if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
    xnn_log_error(
      "failed to create %s operator: operations on data type are not supported",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " subsampling: subsampling dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
      xnn_operator_type_to_string(operator_type), groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_channel_stride < input_channels) {
    xnn_log_error(
      "failed to create %s operator with input channel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      input_channel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_channel_stride < output_channels) {
    xnn_log_error(
      "failed to create %s operator with output channel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      xnn_operator_type_to_string(operator_type),
      output_channel_stride, groups, group_output_channels);
    goto error;
  }

  if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
    xnn_log_error(
      "failed to create depthwise %s operator with %zu input channels per group: "
      "depthwise convolution must have exactly 1 input channel per group",
      xnn_operator_type_to_string(operator_type), group_input_channels);
    goto error;
  }

  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    if (any_padding) {
      xnn_log_error(
        "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32 " padding: "
        "TensorFlow SAME padding can't be combined with explicit padding specification",
        xnn_operator_type_to_string(operator_type),
        input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
      goto error;
    }
  }

  status = xnn_status_out_of_memory;

  convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (convolution_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  if (caches != NULL) {
    convolution_op->weights_cache = caches->weights_cache;
    convolution_op->code_cache = caches->code_cache;
  }

  const size_t kernel_size = kernel_height * kernel_width;

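  // Select the microkernel type: a 1x1, stride-1, unpadded convolution with a
  // single input and output channel per group reduces to a per-channel
  // multiply-add (VMULCADDC); other single-channel-per-group cases use a
  // depthwise (DWCONV) microkernel when one matches the kernel size; 1x1,
  // stride-1, unpadded convolutions use GEMM; everything else uses IGEMM.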
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_default;
  const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
  if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding && vmulcaddc_parameters != NULL) {
    ukernel_type = xnn_ukernel_type_vmulcaddc;
  } else if (group_input_channels == 1 && group_output_channels == 1 && dwconv_ukernel != NULL) {
    ukernel_type = xnn_ukernel_type_dwconv;
  } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
    ukernel_type = xnn_ukernel_type_gemm;
  } else {
    ukernel_type = xnn_ukernel_type_igemm;
  }
  assert(ukernel_type != xnn_ukernel_type_default);

  if (num_post_operations != 0 && ukernel_type != xnn_ukernel_type_gemm) {
    xnn_log_error(
      "convolution with post operations is not supported for these parameters: "
      "kernel_size: %zu unit_subsampling: %d padding: %d",
      kernel_size, unit_subsampling, any_padding);
    goto error;
  }

  size_t zero_size = 0;
  switch (ukernel_type) {
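    // 1x1 convolution with a single input and output channel per group: pack
    // the per-channel multiplier and bias for the VMULCADDC microkernel.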
    case xnn_ukernel_type_vmulcaddc:
    {
      assert(vmulcaddc_parameters != NULL);
      assert(vmulcaddc_params != NULL);

      const size_t c_stride = round_up_po2(groups, vmulcaddc_parameters->channel_tile);
      const size_t packed_weights_size = ((UINT32_C(1) << log2_filter_element_size) + bias_element_size) * c_stride;
      size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT);
      void* weights_ptr = xnn_get_pointer_to_write_weights(
        convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
      if (weights_ptr == NULL) {
        xnn_log_error("failed to reserve or allocate %zu bytes for %s operator vmulcaddc packed weights",
                      aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
        goto error;
      }

      pack_vmulcaddc_w(
        groups, vmulcaddc_parameters->channel_tile,
        kernel, bias, weights_ptr, packing_params);

      if (use_weights_cache(convolution_op)) {
        convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
          convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
      }

      memcpy(&convolution_op->params, vmulcaddc_params, vmulcaddc_params_size);

      convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
        .function = vmulcaddc_parameters->ukernel,
        .mr = vmulcaddc_parameters->row_tile,
      };
      break;
    }
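    // Depthwise convolution: pack the kernel in HWG or GHW layout for the
    // unipass DWCONV microkernel and append per-channel scales when present.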
    case xnn_ukernel_type_dwconv:
    {
      assert(dwconv_ukernel != NULL);
      assert(dwconv_ukernel->primary_tile == kernel_size);

      const size_t c_stride = round_up_po2(groups, dwconv_ukernel->channel_tile);
      const size_t packed_weights_size = ((kernel_size << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * c_stride;
      size_t aligned_total_weights_size = round_up_po2(packed_weights_size, XNN_ALLOCATION_ALIGNMENT);
      void* weights_ptr = xnn_get_pointer_to_write_weights(
        convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
      if (weights_ptr == NULL) {
        xnn_log_error("failed to reserve or allocate %zu bytes for %s operator dwconv packed weights",
                      aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
        goto error;
      }
      memcpy(&convolution_op->params, dwconv_params, dwconv_params_size);

      if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
        pack_dwconv_hwg_w(
          dwconv_ukernel->primary_tile,
          kernel_height, kernel_width,
          groups, dwconv_ukernel->channel_tile,
          kernel, bias, weights_ptr,
          dwconv_ukernel->channel_tile * extra_weights_bytes,
          packing_params);
      } else {
        pack_dwconv_ghw_w(
          dwconv_ukernel->primary_tile,
          kernel_height, kernel_width,
          groups, dwconv_ukernel->channel_tile,
          kernel, bias, weights_ptr,
          dwconv_ukernel->channel_tile * extra_weights_bytes,
          packing_params);
      }

      if (scale_params != NULL) {
        assert(init_scale_params != NULL);

        init_scale_params(
          groups, dwconv_ukernel->channel_tile,
          dwconv_ukernel->channel_tile * ((kernel_size << log2_filter_element_size) + bias_element_size + extra_weights_bytes),
          scale_params,
          (void*) ((uintptr_t) weights_ptr + dwconv_ukernel->channel_tile * ((kernel_size << log2_filter_element_size) + bias_element_size)));
      }

      if (use_weights_cache(convolution_op)) {
        convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
          convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
      }

      const union dwconv_fused_ukernels* ukernels = &dwconv_ukernel->minmax;
      if (linear_activation && dwconv_ukernel->linear.unipass != NULL) {
        ukernels = &dwconv_ukernel->linear;
      }
      convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
        .unipass_function = ukernels->unipass,
        .primary_tile = dwconv_ukernel->primary_tile,
        .incremental_tile = dwconv_ukernel->incremental_tile,
      };

      zero_size = XNN_EXTRA_BYTES + (c_stride << log2_input_element_size);
      break;
    }
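    // 1x1 (GEMM) and general (IGEMM) convolutions: pack per-group weights for
    // the selected nr/kr/sr tile, copy the microkernel table for MR 1..mr,
    // and, when a JIT code cache is attached, generate specialized
    // microkernels for each MR.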
    case xnn_ukernel_type_gemm:
    case xnn_ukernel_type_igemm:
    {
      const uint32_t nr = gemm_parameters->nr;
      const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
      const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;
      const size_t n_stride = round_up(group_output_channels, nr);
      const size_t k_stride = round_up_po2(group_input_channels, kr * sr);

      const size_t packed_group_weights_size = ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes) * n_stride;
      const size_t aligned_total_weights_size = round_up_po2(packed_group_weights_size * groups, XNN_ALLOCATION_ALIGNMENT);
      void* weights_ptr = xnn_get_pointer_to_write_weights(
        convolution_op, aligned_total_weights_size, packed_weights_padding_byte);
      if (weights_ptr == NULL) {
        xnn_log_error("failed to reserve or allocate %zu bytes for %s operator gemm packed weights",
                      aligned_total_weights_size, xnn_operator_type_to_string(operator_type));
        goto error;
      }
      memcpy(&convolution_op->params, gemm_params, gemm_params_size);
      convolution_op->num_post_operation_params = num_post_operations;
      convolution_op->post_operation_params = post_operation_params;

      const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
      const uint32_t mr = gemm_parameters->mr;
      if (linear_activation && gemm_parameters->linear.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) {
        gemm_ukernels = &gemm_parameters->linear;
      } else if (relu_activation && gemm_parameters->relu.gemm[mr - 1].function[XNN_UARCH_DEFAULT] != NULL) {
        gemm_ukernels = &gemm_parameters->relu;
      }
      switch (ukernel_type) {
        case xnn_ukernel_type_gemm:
          pack_gemm_goi_w(
            groups, group_output_channels, group_input_channels,
            nr, kr, sr,
            kernel, bias, weights_ptr, gemm_parameters->nr * extra_weights_bytes, packing_params);
          convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
            .mr = mr,
            .nr = nr,
            .kr = kr,
            .sr = sr,
          };

          assert(XNN_MAX_MR >= mr);
          for (size_t i = 0; i < mr; i++) {
            convolution_op->ukernel.gemm.gemm_cases[i] = gemm_ukernels->gemm[i];
          }

          #if XNN_PLATFORM_JIT
            generate_gemms_up_to_max_mr(
              mr, gemm_parameters->generator, jit_gemm_params, group_output_channels, nr,
              group_input_channels, log2_input_element_size, convolution_op);
          #endif  // XNN_PLATFORM_JIT

          break;
        case xnn_ukernel_type_igemm:
          if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
            pack_conv_kgo_w(
              groups, group_output_channels, kernel_size,
              nr, kr, sr,
              kernel, bias, weights_ptr, gemm_parameters->nr * extra_weights_bytes, packing_params);
          } else {
            pack_conv_goki_w(
              groups, group_output_channels, kernel_size, group_input_channels,
              nr, kr, sr,
              kernel, bias, weights_ptr, gemm_parameters->nr * extra_weights_bytes, packing_params);
          }
          convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
            .mr = mr,
            .nr = nr,
            .kr = kr,
            .sr = sr,
          };

          assert(XNN_MAX_MR >= mr);
          for (size_t i = 0; i < mr; i++) {
            convolution_op->ukernel.igemm.igemm_cases[i] = gemm_ukernels->igemm[i];
          }

          #if XNN_PLATFORM_JIT
            generate_igemms_up_to_max_mr(
              mr, gemm_parameters->generator, jit_gemm_params, group_output_channels, nr,
              group_input_channels, log2_input_element_size, kernel_size, convolution_op);
          #endif  // XNN_PLATFORM_JIT

          break;
        default:
          XNN_UNREACHABLE;
      }

      if (scale_params != NULL) {
        assert(init_scale_params != NULL);

        void* group_weights = (void*)
          ((uintptr_t) weights_ptr + gemm_parameters->nr * ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size));
        const size_t weights_stride = (kernel_size * k_stride << log2_filter_element_size) + bias_element_size + extra_weights_bytes;
        for (uint32_t group = 0; group < groups; group++) {
          init_scale_params(
            group_output_channels, gemm_parameters->nr,
            gemm_parameters->nr * weights_stride,
            scale_params, group_weights);
          scale_params += group_output_channels;
          group_weights = (void*) ((uintptr_t) group_weights + n_stride * weights_stride);
        }
      }

      if (use_weights_cache(convolution_op)) {
        convolution_op->packed_weights.offset = xnn_get_or_insert_weights_cache(
          convolution_op->weights_cache, weights_ptr, aligned_total_weights_size);
      }

      zero_size = XNN_EXTRA_BYTES + (k_stride << log2_input_element_size);
      break;
    }
    default:
      XNN_UNREACHABLE;
  }

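  // Convolutions that read padded input elements source them from a zero
  // buffer filled with the datatype-specific padding byte (the input zero
  // point for quantized datatypes).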
  const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
  if (any_padding || tf_same_padding) {
    convolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
    if (convolution_op->zero_buffer == NULL) {
      xnn_log_error(
        "failed to allocate %zu bytes for %s operator zero padding",
        zero_size, xnn_operator_type_to_string(operator_type));
      goto error;
    }
    memset(convolution_op->zero_buffer, input_padding_byte, zero_size);
  }

  convolution_op->padding_top = input_padding_top;
  convolution_op->padding_right = input_padding_right;
  convolution_op->padding_bottom = input_padding_bottom;
  convolution_op->padding_left = input_padding_left;

  convolution_op->kernel_height = kernel_height;
  convolution_op->kernel_width = kernel_width;
  convolution_op->stride_height = subsampling_height;
  convolution_op->stride_width = subsampling_width;
  convolution_op->dilation_height = dilation_height;
  convolution_op->dilation_width = dilation_width;
  convolution_op->groups = groups;
  convolution_op->group_input_channels = group_input_channels;
  convolution_op->group_output_channels = group_output_channels;
  convolution_op->input_pixel_stride = input_channel_stride;
  convolution_op->output_pixel_stride = output_channel_stride;

  convolution_op->type = operator_type;
  convolution_op->ukernel.type = ukernel_type;
  convolution_op->flags = flags & ~XNN_FLAG_TENSORFLOW_SAME_PADDING;
  if (tf_same_padding) {
    convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
  }

  convolution_op->state = xnn_run_state_invalid;

  *convolution_op_out = convolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(convolution_op);
  return status;
}

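// Creates an asymmetrically quantized (QU8) NHWC convolution operator:
// validates the quantization parameters, precomputes the requantization
// scale, and delegates to create_convolution2d_nhwc with QU8 packing
// functions and microkernel parameters.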
enum xnn_status xnn_create_convolution2d_nhwc_qu8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  uint8_t input_zero_point,
  float input_scale,
  uint8_t kernel_zero_point,
  float kernel_scale,
  const uint8_t* kernel,
  const int32_t* bias,
  uint8_t output_zero_point,
  float output_scale,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

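  // The requantization scale maps the int32 accumulator back to the output
  // quantization domain; the fixed-point requantization used by the QU8
  // microkernels requires it to be below 256.0.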
  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 256.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater than or equal to 256.0",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  const struct xnn_qu8_packing_params packing_params = {
    .input_zero_point = input_zero_point,
    .kernel_zero_point = kernel_zero_point,
  };

  union xnn_qu8_conv_minmax_params gemm_params;
  if XNN_LIKELY(xnn_params.qu8.gemm.init.qu8 != NULL) {
    xnn_params.qu8.gemm.init.qu8(&gemm_params,
      kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
  }

  union xnn_qu8_conv_minmax_params dwconv_params;
  const struct dwconv_parameters* dwconv_ukernel =
    find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.qu8.dwconv, XNN_MAX_QU8_DWCONV_UKERNELS);
  if XNN_LIKELY(dwconv_ukernel != NULL) {
    dwconv_ukernel->init.qu8(&dwconv_params,
      kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
  }

  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) NULL,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_qu8_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_qu8_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_qu8_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_qu8_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
    0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
    &gemm_params, sizeof(gemm_params),
    &dwconv_params, sizeof(dwconv_params),
    NULL /* vmulcaddc params */, 0,
    &xnn_params.qu8.gemm, dwconv_ukernel, NULL /* vmulcaddc parameters */,
    NULL /* jit_gemm_params */,
    false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QU8,
    xnn_operator_type_convolution_nhwc_qu8,
    0, NULL,
    caches,
    convolution_op_out);
}

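// Creates a symmetrically quantized (QS8) NHWC convolution operator with a
// single kernel scale shared by all output channels.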
enum xnn_status xnn_create_convolution2d_nhwc_qs8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  float kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), input_scale);
    return xnn_status_invalid_parameter;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), kernel_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const float requantization_scale = input_scale * kernel_scale / output_scale;
  if (requantization_scale >= 256.0f) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "requantization scale %.7g is greater than or equal to 256.0",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8),
      input_scale, kernel_scale, output_scale, requantization_scale);
    return xnn_status_unsupported_parameter;
  }

  const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };

  union xnn_qs8_conv_minmax_params gemm_params;
  if XNN_LIKELY(xnn_params.qs8.gemm.init.qs8 != NULL) {
    xnn_params.qs8.gemm.init.qs8(&gemm_params,
      requantization_scale, output_zero_point, output_min, output_max);
  }

  union xnn_qs8_conv_minmax_params dwconv_params;
  const struct dwconv_parameters* dwconv_ukernel =
    find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.qs8.dwconv, XNN_MAX_QS8_DWCONV_UKERNELS);
  if XNN_LIKELY(dwconv_ukernel != NULL) {
    dwconv_ukernel->init.qs8(&dwconv_params,
      requantization_scale, output_zero_point, output_min, output_max);
  }

  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) NULL,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_qs8_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_qs8_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_qs8_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_qs8_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
    0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
    &gemm_params, sizeof(gemm_params),
    &dwconv_params, sizeof(dwconv_params),
    NULL /* vmulcaddc params */, 0,
    &xnn_params.qs8.gemm, dwconv_ukernel, NULL /* vmulcaddc parameters */,
    NULL /* jit_gemm_params */,
    false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QS8,
    xnn_operator_type_convolution_nhwc_qs8,
    0, NULL,
    caches,
    convolution_op_out);
}

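// Creates a per-channel quantized (QC8) NHWC convolution operator: each
// output channel has its own kernel scale, so a per-channel requantization
// scale array is packed alongside the weights via
// xnn_init_qc8_scale_fp32_params.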
enum xnn_status xnn_create_convolution2d_nhwc_qc8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  int8_t input_zero_point,
  float input_scale,
  const float* kernel_scale,
  const int8_t* kernel,
  const int32_t* bias,
  int8_t output_zero_point,
  float output_scale,
  int8_t output_min,
  int8_t output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out)
{
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), input_scale);
    return xnn_status_invalid_parameter;
  }

  for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) {
    if (kernel_scale[output_channel] <= 0.0f || !isnormal(kernel_scale[output_channel])) {
      xnn_log_error(
        "failed to create %s operator with %.7g kernel scale in output channel #%zu: "
        "scale must be finite, normalized, and positive",
        xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), kernel_scale[output_channel],
        output_channel);
      return xnn_status_invalid_parameter;
    }
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_scale);
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  float* requantization_scale = XNN_SIMD_ALLOCA(groups * group_output_channels * sizeof(float));
  for (size_t output_channel = 0; output_channel < groups * group_output_channels; output_channel++) {
    requantization_scale[output_channel] = input_scale * kernel_scale[output_channel] / output_scale;
    if (requantization_scale[output_channel] >= 256.0f) {
      xnn_log_error(
        "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale in output channel #%zu: "
        "requantization scale %.7g is greater than or equal to 256.0",
        xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qc8),
        input_scale, kernel_scale[output_channel], output_scale,
        output_channel, requantization_scale[output_channel]);
      return xnn_status_unsupported_parameter;
    }
  }

  const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };

  union xnn_qc8_conv_minmax_params gemm_params;
  if XNN_LIKELY(xnn_params.qc8.gemm.init.qc8 != NULL) {
    xnn_params.qc8.gemm.init.qc8(&gemm_params,
      output_zero_point, output_min, output_max);
  }

  union xnn_qc8_conv_minmax_params dwconv_params;
  const struct dwconv_parameters* dwconv_ukernel =
    find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.qc8.dwconv, XNN_MAX_QC8_DWCONV_UKERNELS);
  if XNN_LIKELY(dwconv_ukernel != NULL) {
    dwconv_ukernel->init.qc8(&dwconv_params,
      output_zero_point, output_min, output_max);
  }

  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) NULL,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_qs8_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_qs8_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_qs8_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_qs8_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
    &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
    sizeof(float) /* extra weights bytes */, xnn_init_qc8_scale_fp32_params, requantization_scale,
    &gemm_params, sizeof(gemm_params),
    &dwconv_params, sizeof(dwconv_params),
    NULL /* vmulcaddc params */, 0,
    &xnn_params.qc8.gemm, dwconv_ukernel, NULL /* vmulcaddc parameters */,
    NULL /* jit_gemm_params */,
    false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QC8,
    xnn_operator_type_convolution_nhwc_qc8,
    0, NULL,
    caches,
    convolution_op_out);
}

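// Creates a half-precision (F16) NHWC convolution operator. Output bounds are
// rounded to representable FP16 values before validation; with
// XNN_FLAG_FP32_STATIC_WEIGHTS the FP32 kernel and bias are converted to FP16
// while packing.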
enum xnn_status xnn_create_convolution2d_nhwc_f16(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const void* kernel,
  const void* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
    return xnn_status_invalid_parameter;
  }

  const uint16_t fp16_output_min = fp16_ieee_from_fp32_value(output_min);
  const uint16_t fp16_output_max = fp16_ieee_from_fp32_value(output_max);
  const float rounded_output_min = fp16_ieee_to_fp32_value(fp16_output_min);
  const float rounded_output_max = fp16_ieee_to_fp32_value(fp16_output_max);
  if (rounded_output_min >= rounded_output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16), rounded_output_min, rounded_output_max);
    return xnn_status_invalid_parameter;
  }

  union xnn_f16_minmax_params gemm_params;
  if XNN_LIKELY(xnn_params.f16.gemm.init.f16 != NULL) {
    xnn_params.f16.gemm.init.f16(&gemm_params,
      fp16_output_min, fp16_output_max);
  }

  union xnn_f16_minmax_params dwconv_params;
  const struct dwconv_parameters* dwconv_ukernel =
    find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.f16.dwconv, XNN_MAX_F16_DWCONV_UKERNELS);
  if XNN_LIKELY(dwconv_ukernel != NULL) {
    dwconv_ukernel->init.f16(&dwconv_params, fp16_output_min, fp16_output_max);
  }

  union xnn_f16_minmax_params vmulcaddc_params;
  if XNN_LIKELY(xnn_params.f16.vmulcaddc.init.f16 != NULL) {
    xnn_params.f16.vmulcaddc.init.f16(&vmulcaddc_params, fp16_output_min, fp16_output_max);
  }

  xnn_pack_vmulcaddc_w_function pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_function) xnn_pack_f16_vmulcaddc_w;
  xnn_pack_dwconv_hwg_w_function pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_function) xnn_pack_f16_dwconv_hwg_w;
  xnn_pack_dwconv_ghw_w_function pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_function) xnn_pack_f16_dwconv_ghw_w;
  xnn_pack_gemm_goi_w_function pack_gemm_goi_w = (xnn_pack_gemm_goi_w_function) xnn_pack_f16_gemm_goi_w;
  xnn_pack_conv_kgo_w_function pack_conv_kgo_w = (xnn_pack_conv_kgo_w_function) xnn_pack_f16_conv_kgo_w;
  xnn_pack_conv_goki_w_function pack_conv_goki_w = (xnn_pack_conv_goki_w_function) xnn_pack_f16_conv_goki_w;
  if (flags & XNN_FLAG_FP32_STATIC_WEIGHTS) {
    pack_vmulcaddc_w = (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_to_f16_vmulcaddc_w;
    pack_dwconv_hwg_w = (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_to_f16_dwconv_hwg_w;
    pack_dwconv_ghw_w = (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_to_f16_dwconv_ghw_w;
    pack_gemm_goi_w = (xnn_pack_gemm_goi_w_function) xnn_pack_f32_to_f16_gemm_goi_w;
    pack_conv_kgo_w = (xnn_pack_conv_kgo_w_function) xnn_pack_f32_to_f16_conv_kgo_w;
    pack_conv_goki_w = (xnn_pack_conv_goki_w_function) xnn_pack_f32_to_f16_conv_goki_w;
  }

  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
    1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
    sizeof(uint16_t) /* sizeof(bias element) */,
    pack_vmulcaddc_w,
    pack_dwconv_hwg_w,
    pack_dwconv_ghw_w,
    pack_gemm_goi_w,
    pack_conv_kgo_w,
    pack_conv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
    &gemm_params, sizeof(gemm_params),
    &dwconv_params, sizeof(dwconv_params),
    &vmulcaddc_params, sizeof(vmulcaddc_params),
    &xnn_params.f16.gemm, dwconv_ukernel, &xnn_params.f16.vmulcaddc,
    NULL /* jit_gemm_params */,
    false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_F16,
    xnn_operator_type_convolution_nhwc_f16,
    0, NULL,
    caches,
    convolution_op_out);
}

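// Creates a single-precision (F32) NHWC convolution operator. An output range
// of (-inf, +inf) is detected as linear activation and [0, +inf) as ReLU, so
// the corresponding unclamped microkernel variants can be selected when
// available; when the default GEMM tile is wider than the number of output
// channels, the narrower gemm2 variant is preferred.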
enum xnn_status xnn_create_convolution2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t kernel_height,
  uint32_t kernel_width,
  uint32_t subsampling_height,
  uint32_t subsampling_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  uint32_t groups,
  size_t group_input_channels,
  size_t group_output_channels,
  size_t input_channel_stride,
  size_t output_channel_stride,
  const float* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_caches_t caches,
  xnn_operator_t* convolution_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
  const bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f);

  const struct gemm_parameters* gemm_parameters = &xnn_params.f32.gemm;
  if (gemm_parameters->nr > group_output_channels) {
    // Default micro-kernel is suboptimal. Try to find a better micro-kernel.

    if (xnn_params.f32.gemm2.minmax.igemm[gemm_parameters->mr].function[XNN_UARCH_DEFAULT] != NULL) {
      gemm_parameters = &xnn_params.f32.gemm2;
    }
  }

  union xnn_f32_minmax_params gemm_params;
  if XNN_LIKELY(gemm_parameters->init.f32 != NULL) {
    gemm_parameters->init.f32(&gemm_params, output_min, output_max);
  }

  struct jit_gemm_params jit_gemm_params = {
    .f32_minmax = {
      .min = output_min,
      .max = output_max
    }
  };

  union xnn_f32_minmax_params dwconv_params;
  const struct dwconv_parameters* dwconv_ukernel =
    find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS);
  if XNN_LIKELY(dwconv_ukernel != NULL) {
    dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max);
  }

  union xnn_f32_minmax_params vmulcaddc_params;
  if XNN_LIKELY(xnn_params.f32.vmulcaddc.init.f32 != NULL) {
    xnn_params.f32.vmulcaddc.init.f32(&vmulcaddc_params, output_min, output_max);
  }

  return create_convolution2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    kernel_height, kernel_width,
    subsampling_height, subsampling_width,
    dilation_height, dilation_width,
    groups, group_input_channels, group_output_channels,
    input_channel_stride, output_channel_stride,
    kernel, bias, flags,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_vmulcaddc_w,
    (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_dwconv_hwg_w,
    (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_dwconv_ghw_w,
    (xnn_pack_gemm_goi_w_function) xnn_pack_f32_gemm_goi_w,
    (xnn_pack_conv_kgo_w_function) xnn_pack_f32_conv_kgo_w,
    (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
    NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
    0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
    &gemm_params, sizeof(gemm_params),
    &dwconv_params, sizeof(dwconv_params),
    &vmulcaddc_params, sizeof(vmulcaddc_params),
    gemm_parameters, dwconv_ukernel, &xnn_params.f32.vmulcaddc,
    &jit_gemm_params,
    linear_activation, relu_activation, XNN_INIT_FLAG_F32,
    xnn_operator_type_convolution_nhwc_f32,
    0, NULL,
    caches,
    convolution_op_out);
}

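// Creates an F32 NHWC convolution operator with a chain of fused post
// operations. The convolution itself runs with linear activation (any
// clamping must be expressed as a post operation) and requires JIT support,
// since the post operations are passed to the GEMM code generator via
// jit_gemm_params.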
xnn_create_fused_convolution2d_nhwc_f32(uint32_t input_padding_top,uint32_t input_padding_right,uint32_t input_padding_bottom,uint32_t input_padding_left,uint32_t kernel_height,uint32_t kernel_width,uint32_t subsampling_height,uint32_t subsampling_width,uint32_t dilation_height,uint32_t dilation_width,uint32_t groups,size_t group_input_channels,size_t group_output_channels,size_t input_channel_stride,size_t output_channel_stride,const float * kernel,const float * bias,size_t num_post_operations,struct xnn_post_operation * post_operations,uint32_t flags,xnn_caches_t caches,xnn_operator_t * convolution_op_out)1233 enum xnn_status xnn_create_fused_convolution2d_nhwc_f32(
1234 uint32_t input_padding_top,
1235 uint32_t input_padding_right,
1236 uint32_t input_padding_bottom,
1237 uint32_t input_padding_left,
1238 uint32_t kernel_height,
1239 uint32_t kernel_width,
1240 uint32_t subsampling_height,
1241 uint32_t subsampling_width,
1242 uint32_t dilation_height,
1243 uint32_t dilation_width,
1244 uint32_t groups,
1245 size_t group_input_channels,
1246 size_t group_output_channels,
1247 size_t input_channel_stride,
1248 size_t output_channel_stride,
1249 const float* kernel,
1250 const float* bias,
1251 size_t num_post_operations,
1252 struct xnn_post_operation* post_operations,
1253 uint32_t flags,
1254 xnn_caches_t caches,
1255 xnn_operator_t* convolution_op_out)
1256 {
1257 #if !XNN_ENABLE_JIT
1258 xnn_log_error(
1259     "failed to create %s operator: convolution with post operations is only available when JIT is enabled",
1260 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
1261 return xnn_status_invalid_parameter;
1262 #endif
1263
1264   // Convolution is specified with linear activation; any clamping should be specified as a post operation.
1265 const float output_max = INFINITY;
1266 const float output_min = -INFINITY;
1267
1268 struct jit_gemm_params jit_gemm_params = {
1269 .f32_minmax = {
1270 .min = output_min,
1271 .max = output_max
1272 },
1273 .num_post_operations = num_post_operations,
1274 .post_operations = post_operations,
1275 };
1276
1277 char* post_operation_params = allocate_and_initialize_post_operation_params(num_post_operations, post_operations);
1278
1279 union xnn_f32_minmax_params gemm_params;
1280 if XNN_LIKELY(xnn_params.f32.gemm.init.f32 != NULL) {
1281 xnn_params.f32.gemm.init.f32(&gemm_params, output_min, output_max);
1282 }
1283
1284 union xnn_f32_minmax_params dwconv_params;
1285 const struct dwconv_parameters* dwconv_ukernel =
1286 find_dwconv_ukernel(kernel_height * kernel_width, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS);
1287 if XNN_LIKELY(dwconv_ukernel != NULL) {
1288 dwconv_ukernel->init.f32(&dwconv_params, output_min, output_max);
1289 }
1290
1291 union xnn_f32_minmax_params vmulcaddc_params;
1292 if XNN_LIKELY(xnn_params.f32.vmulcaddc.init.f32 != NULL) {
1293 xnn_params.f32.vmulcaddc.init.f32(&vmulcaddc_params, output_min, output_max);
1294 }
1295
1296 return create_convolution2d_nhwc(
1297 input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
1298 kernel_height, kernel_width,
1299 subsampling_height, subsampling_width,
1300 dilation_height, dilation_width,
1301 groups, group_input_channels, group_output_channels,
1302 input_channel_stride, output_channel_stride,
1303 kernel, bias, flags,
1304 2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
1305 2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
1306 sizeof(float) /* sizeof(bias element) */,
1307 (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_vmulcaddc_w,
1308 (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_dwconv_hwg_w,
1309 (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_dwconv_ghw_w,
1310 (xnn_pack_gemm_goi_w_function) xnn_pack_f32_gemm_goi_w,
1311 (xnn_pack_conv_kgo_w_function) xnn_pack_f32_conv_kgo_w,
1312 (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
1313 NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
1314 0 /* extra weights bytes */, NULL /* init scale params fn */, NULL /* scale params */,
1315 (void*) &gemm_params, sizeof(gemm_params),
1316 &dwconv_params, sizeof(dwconv_params),
1317 &vmulcaddc_params, sizeof(vmulcaddc_params),
1318 &xnn_params.f32.gemm, dwconv_ukernel, &xnn_params.f32.vmulcaddc,
1319 &jit_gemm_params,
1320 true /* linear_activation */, false /* relu_activation */, XNN_INIT_FLAG_F32,
1321 xnn_operator_type_convolution_nhwc_f32,
1322 num_post_operations, post_operation_params,
1323 caches,
1324 convolution_op_out);
1325 }
1326
1327 static enum xnn_status setup_convolution2d_nhwc(
1328 xnn_operator_t convolution_op,
1329 enum xnn_operator_type expected_operator_type,
1330 size_t batch_size,
1331 size_t input_height,
1332 size_t input_width,
1333 const void* input,
1334 void* output,
1335 uint32_t datatype_init_flags,
1336 uint32_t log2_input_element_size,
1337 uint32_t log2_filter_element_size,
1338 uint32_t extra_weights_elements_size,
1339 uint32_t log2_output_element_size,
1340 size_t num_threads)
1341 {
1342 if (convolution_op->type != expected_operator_type) {
1343 xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
1344 xnn_operator_type_to_string(expected_operator_type),
1345 xnn_operator_type_to_string(convolution_op->type));
1346 return xnn_status_invalid_parameter;
1347 }
1348 convolution_op->state = xnn_run_state_invalid;
1349
1350 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
1351 xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
1352 xnn_operator_type_to_string(convolution_op->type));
1353 return xnn_status_uninitialized;
1354 }
1355
1356 if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
1357 xnn_log_error(
1358       "failed to setup %s operator: operations on data type are not supported",
1359 xnn_operator_type_to_string(convolution_op->type));
1360 return xnn_status_unsupported_hardware;
1361 }
1362
1363 if (input_width == 0 || input_height == 0) {
1364 xnn_log_error(
1365 "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
1366 xnn_operator_type_to_string(convolution_op->type), input_width, input_height);
1367 return xnn_status_invalid_parameter;
1368 }
1369
1370 if (batch_size == 0) {
1371 convolution_op->state = xnn_run_state_skip;
1372 return xnn_status_success;
1373 }
1374
1375 if (convolution_op->weights_cache != NULL && !xnn_weights_cache_is_finalized(convolution_op->weights_cache)) {
1376 xnn_log_error("failed to setup %s operator: weights cache is not finalized",
1377 xnn_operator_type_to_string(convolution_op->type));
1378 return xnn_status_invalid_state;
1379 }
1380
1381 convolution_op->batch_size = batch_size;
1382 convolution_op->input_height = input_height;
1383 convolution_op->input_width = input_width;
1384 convolution_op->input = input;
1385
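  // TensorFlow SAME padding: the output size is ceil(input / stride), and the padding needed to
  // produce it is computed here at setup time and split as evenly as possible, with any odd pixel
  // assigned to the bottom/right edge.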
1386 if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
1387 convolution_op->output_height = compute_output_dimension_with_tf_same_padding(
1388 input_height, convolution_op->stride_height);
1389 convolution_op->output_width = compute_output_dimension_with_tf_same_padding(
1390 input_width, convolution_op->stride_width);
1391
1392 const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1;
1393 const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1;
1394 const size_t total_padding_height =
1395 (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height;
1396 const size_t total_padding_width =
1397 (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width;
1398 convolution_op->padding_top = total_padding_height / 2;
1399 convolution_op->padding_left = total_padding_width / 2;
1400 convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top;
1401 convolution_op->padding_right = total_padding_width - convolution_op->padding_left;
1402 } else {
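    // With explicit padding, the output size follows the usual convolution formula:
    // (padded input size - dilated kernel size) / stride + 1.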
1403 convolution_op->output_height = xnn_compute_convolution_output_dimension(
1404 convolution_op->padding_top + input_height + convolution_op->padding_bottom,
1405 convolution_op->kernel_height,
1406 convolution_op->dilation_height,
1407 convolution_op->stride_height);
1408 convolution_op->output_width = xnn_compute_convolution_output_dimension(
1409 convolution_op->padding_left + input_width + convolution_op->padding_right,
1410 convolution_op->kernel_width,
1411 convolution_op->dilation_width,
1412 convolution_op->stride_width);
1413 }
1414 convolution_op->output = output;
1415
1416 switch (convolution_op->ukernel.type) {
1417 case xnn_ukernel_type_gemm:
1418 {
1419       // Convolution maps directly to GEMM and does not use an indirection buffer.
1420
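      // The GEMM's M dimension covers all batch_size * output pixels; its N dimension is the
      // number of output channels in each group.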
1421 const size_t output_height = convolution_op->output_height;
1422 const size_t output_width = convolution_op->output_width;
1423 const size_t output_size = output_height * output_width;
1424 const size_t batch_output_size = batch_size * output_size;
1425
1426 const size_t groups = convolution_op->groups;
1427 const size_t group_input_channels = convolution_op->group_input_channels;
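      // Stride between the packed weights of consecutive output channels: the extra per-channel
      // weight bytes (bias and, where present, scales) plus the input channels rounded up to the
      // microkernel's KR*SR packing granularity.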
1428 const size_t w_stride = extra_weights_elements_size +
1429 (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr * convolution_op->ukernel.gemm.sr) << log2_filter_element_size);
1430 const size_t group_output_channels = convolution_op->group_output_channels;
1431
1432 uint32_t mr = convolution_op->ukernel.gemm.mr;
1433 const uint32_t nr = convolution_op->ukernel.gemm.nr;
1434 struct xnn_hmp_gemm_ukernel *gemm_cases = convolution_op->ukernel.gemm.gemm_cases;
1435
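      // Select the MR tile: with GEMM M-specialization a heuristic picks the MR that best matches
      // the number of output rows; otherwise fall back to the MR=1 microkernel when there is only
      // a single output pixel to compute.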
1436 #if XNN_ENABLE_GEMM_M_SPECIALIZATION
1437 mr = xnn_get_heuristic_mr_gemm(batch_output_size, mr, nr, gemm_cases);
1438 #else
1439 if (batch_output_size == 1 && gemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) {
1440 mr = 1;
1441 }
1442 #endif
1443
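      // If a JIT-generated microkernel for the selected MR exists in the code cache, patch it in
      // place of the ahead-of-time compiled one.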
1444 #if XNN_PLATFORM_JIT
1445 if (convolution_op->code_cache != NULL) {
1446 const size_t jit_code_offset = gemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT];
1447 if (jit_code_offset != XNN_CACHE_NOT_FOUND) {
1448 gemm_cases[mr - 1].function[XNN_UARCH_DEFAULT] =
1449 (xnn_gemm_ukernel_function) cached_code_at_offset(convolution_op, jit_code_offset);
1450 }
1451 }
1452 #endif // XNN_PLATFORM_JIT
1453 struct xnn_hmp_gemm_ukernel gemm_ukernel = gemm_cases[mr - 1];
1454
1455 convolution_op->context.gemm = (struct gemm_context) {
1456 .k_scaled = group_input_channels << log2_input_element_size,
1457 .a = input,
1458 .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1459 .packed_w = packed_weights(convolution_op),
1460 .w_stride = w_stride,
1461 .wg_stride = w_stride * round_up(group_output_channels, nr),
1462 .c = output,
1463 .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1464 .cn_stride = nr << log2_output_element_size,
1465 .cg_stride = group_output_channels << log2_output_element_size,
1466 .log2_csize = log2_output_element_size,
1467 .ukernel = gemm_ukernel,
1468 };
1469 memcpy(&convolution_op->context.gemm.params, &convolution_op->params, sizeof(convolution_op->context.gemm.params));
1470 if (convolution_op->num_post_operation_params == 0) {
1471 convolution_op->context.gemm.fused_params = &convolution_op->context.gemm.params;
1472 } else {
1473 convolution_op->context.gemm.fused_params = convolution_op->post_operation_params;
1474 }
1475
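      // Choose the NC tile: start from all output channels of the group and, when multi-threaded,
      // shrink it so each thread gets roughly target_tiles_per_thread tiles; the result is a
      // multiple of NR (in test mode NC is simply fixed to NR).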
1476 #if XNN_TEST_MODE
1477 const size_t nc = nr;
1478 #else
1479 size_t nc = group_output_channels;
1480 if (num_threads > 1) {
1481 const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
1482 const size_t target_tiles_per_thread = 5;
1483 const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
1484 if (max_nc < nc) {
1485 nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
1486 }
1487 }
1488 #endif
1489 if (groups == 1) {
1490 #if XNN_MAX_UARCH_TYPES > 1
1491 if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
1492 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
1493 convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm;
1494 } else {
1495 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1496 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
1497 }
1498 #else
1499 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1500 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
1501 #endif
1502 convolution_op->compute.range[0] = batch_output_size;
1503 convolution_op->compute.range[1] = group_output_channels;
1504 convolution_op->compute.tile[0] = mr;
1505 convolution_op->compute.tile[1] = nc;
1506 } else {
1507 #if XNN_MAX_UARCH_TYPES > 1
1508 if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
1509 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1510 convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_gemm;
1511 } else {
1512 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1513 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
1514 }
1515 #else
1516 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1517 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
1518 #endif
1519 convolution_op->compute.range[0] = groups;
1520 convolution_op->compute.range[1] = batch_output_size;
1521 convolution_op->compute.range[2] = group_output_channels;
1522 convolution_op->compute.tile[0] = mr;
1523 convolution_op->compute.tile[1] = nc;
1524 }
1525 convolution_op->state = xnn_run_state_ready;
1526
1527 return xnn_status_success;
1528 }
1529 case xnn_ukernel_type_igemm:
1530 {
1531 const size_t groups = convolution_op->groups;
1532 const size_t kernel_height = convolution_op->kernel_height;
1533 const size_t kernel_width = convolution_op->kernel_width;
1534 const size_t kernel_size = kernel_height * kernel_width;
1535 const size_t output_height = convolution_op->output_height;
1536 const size_t output_width = convolution_op->output_width;
1537 const size_t output_size = output_height * output_width;
1538
1539 uint32_t mr = convolution_op->ukernel.igemm.mr;
1540 const uint32_t nr = convolution_op->ukernel.igemm.nr;
1541 struct xnn_hmp_igemm_ukernel* igemm_cases = convolution_op->ukernel.igemm.igemm_cases;
1542
1543 #if XNN_ENABLE_GEMM_M_SPECIALIZATION
1544 mr = xnn_get_heuristic_mr_igemm(output_size, mr, nr, igemm_cases);
1545 #else
1546 if (output_size == 1 && igemm_cases[0].function[XNN_UARCH_DEFAULT] != NULL) {
1547 mr = 1;
1548 }
1549 #endif
1550
1551 #if XNN_PLATFORM_JIT
1552 if (convolution_op->code_cache != NULL) {
1553 const size_t jit_code_offset = igemm_cases[mr - 1].generated_code_offset[XNN_UARCH_DEFAULT];
1554 if (jit_code_offset != XNN_CACHE_NOT_FOUND) {
1555 igemm_cases[mr - 1].function[XNN_UARCH_DEFAULT] =
1556 (xnn_igemm_ukernel_function) cached_code_at_offset(convolution_op, jit_code_offset);
1557 }
1558 }
1559 #endif // XNN_PLATFORM_JIT
1560 struct xnn_hmp_igemm_ukernel igemm_ukernel = igemm_cases[mr - 1];
1561
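      // The indirection buffer stores one input-pixel pointer per kernel element for every output
      // pixel, with the output size rounded up to a multiple of MR so the last tile reads valid
      // pointers.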
1562 const size_t tiled_output_size = round_up(output_size, mr);
1563 const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;
1564
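      // Rebuild the indirection buffer only when the input spatial dimensions change; a mere change
      // of the input pointer is compensated by a_offset below.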
1565 if (input_height != convolution_op->last_input_height ||
1566 input_width != convolution_op->last_input_width)
1567 {
1568 const void** indirection_buffer = (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size);
1569 if (indirection_buffer == NULL) {
1570 xnn_log_error(
1571 "failed to allocate %zu bytes for %s operator indirection buffer",
1572 indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
1573 return xnn_status_out_of_memory;
1574 }
1575 convolution_op->indirection_buffer = indirection_buffer;
1576 convolution_op->last_input = input;
1577 convolution_op->last_input_height = input_height;
1578 convolution_op->last_input_width = input_width;
1579
1580 xnn_indirection_init_conv2d(convolution_op, mr, log2_input_element_size);
1581 }
1582
1583 const size_t group_input_channels = convolution_op->group_input_channels;
1584 const size_t w_stride = extra_weights_elements_size +
1585 (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr * convolution_op->ukernel.igemm.sr) * kernel_size << log2_filter_element_size);
1586 const size_t group_output_channels = convolution_op->group_output_channels;
1587 convolution_op->context.igemm = (struct igemm_context) {
1588 .ks = kernel_size,
1589 .ks_scaled = kernel_size * mr * sizeof(void*),
1590 .kc = group_input_channels << log2_input_element_size,
1591 .w_stride = w_stride,
1592 .indirect_a = convolution_op->indirection_buffer,
1593 .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
1594 .zero = convolution_op->zero_buffer,
1595 .packed_w = packed_weights(convolution_op),
1596 .c = convolution_op->output,
1597 .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1598 .cn_stride = nr << log2_output_element_size,
1599 .ga_stride = group_input_channels << log2_input_element_size,
1600 .gw_stride = w_stride * round_up(group_output_channels, nr),
1601 .gc_stride = group_output_channels << log2_output_element_size,
1602 .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
1603 .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
1604 .log2_csize = log2_output_element_size,
1605 .ukernel = igemm_ukernel,
1606 };
1607 memcpy(&convolution_op->context.igemm.params, &convolution_op->params, sizeof(convolution_op->context.igemm.params));
1608
1609 #if XNN_TEST_MODE
1610 const size_t nc = nr;
1611 #else
1612 size_t nc = group_output_channels;
1613 if (num_threads > 1) {
1614 const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
1615 const size_t target_tiles_per_thread = 5;
1616 const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
1617 if (max_nc < nc) {
1618 nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
1619 }
1620 }
1621 #endif
1622 if (groups == 1) {
1623 #if XNN_MAX_UARCH_TYPES > 1
1624 if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1625 if (batch_size > 1) {
1626 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1627 convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_batch_hmp_igemm;
1628 } else {
1629 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
1630 convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_igemm;
1631 }
1632 } else {
1633 if (batch_size > 1) {
1634 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1635 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1636 } else {
1637 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1638 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1639 }
1640 }
1641 #else
1642 if (batch_size > 1) {
1643 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1644 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1645 } else {
1646 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1647 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1648 }
1649 #endif
1650 if (batch_size > 1) {
1651 convolution_op->compute.range[0] = batch_size;
1652 convolution_op->compute.range[1] = output_size;
1653 convolution_op->compute.range[2] = group_output_channels;
1654 } else {
1655 convolution_op->compute.range[0] = output_size;
1656 convolution_op->compute.range[1] = group_output_channels;
1657 }
1658 convolution_op->compute.tile[0] = mr;
1659 convolution_op->compute.tile[1] = nc;
1660 } else {
1661 #if XNN_MAX_UARCH_TYPES > 1
1662 if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1663 if (batch_size > 1) {
1664 convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d_with_uarch;
1665 convolution_op->compute.task_4d_tile_2d_with_id = (pthreadpool_task_4d_tile_2d_with_id_t) xnn_compute_hmp_grouped_batch_igemm;
1666 } else {
1667 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1668 convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_igemm;
1669 }
1670 } else {
1671 if (batch_size > 1) {
1672 convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
1673 convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1674 } else {
1675 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1676 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1677 }
1678 }
1679 #else
1680 if (batch_size > 1) {
1681 convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
1682 convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1683 } else {
1684 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1685 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1686 }
1687 #endif
1688 if (batch_size > 1) {
1689 convolution_op->compute.range[0] = batch_size;
1690 convolution_op->compute.range[1] = groups;
1691 convolution_op->compute.range[2] = output_size;
1692 convolution_op->compute.range[3] = group_output_channels;
1693 } else {
1694 convolution_op->compute.range[0] = groups;
1695 convolution_op->compute.range[1] = output_size;
1696 convolution_op->compute.range[2] = group_output_channels;
1697 }
1698 convolution_op->compute.tile[0] = mr;
1699 convolution_op->compute.tile[1] = nc;
1700 }
1701 convolution_op->state = xnn_run_state_ready;
1702
1703 return xnn_status_success;
1704 }
1705 case xnn_ukernel_type_dwconv:
1706 {
1707 const size_t kernel_height = convolution_op->kernel_height;
1708 const size_t kernel_width = convolution_op->kernel_width;
1709 const size_t kernel_size = kernel_height * kernel_width;
1710 const size_t output_height = convolution_op->output_height;
1711 const size_t output_width = convolution_op->output_width;
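      // Step between indirection entries of adjacent output pixels: with unit dilation neighboring
      // pixels share input columns and only stride_width new pointers are needed per pixel; with
      // dilation there is no sharing, so the full kernel_width is used.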
1712 const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
1713 const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
1714 const size_t primary_tile = convolution_op->ukernel.dwconv.primary_tile;
1715 if (input_height != convolution_op->last_input_height || input_width != convolution_op->last_input_width) {
1716         // The microkernel will read (primary_tile - kernel_size) elements past the end of the indirection buffer.
1717 const size_t indirection_buffer_size =
1718 sizeof(void*) * (primary_tile - kernel_size + output_height * step_height);
1719
1720 const void** indirection_buffer =
1721 (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size);
1722 if (indirection_buffer == NULL) {
1723 xnn_log_error("failed to allocate %zu bytes for %s operator indirection buffer",
1724 indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
1725 return xnn_status_out_of_memory;
1726 }
1727 convolution_op->indirection_buffer = indirection_buffer;
1728
1729 xnn_indirection_init_dwconv2d(convolution_op, step_height, step_width, primary_tile, log2_input_element_size);
1730
1731 convolution_op->last_input = input;
1732 convolution_op->last_input_height = input_height;
1733 convolution_op->last_input_width = input_width;
1734 }
1735
1736 const size_t groups = convolution_op->groups;
1737 convolution_op->context.dwconv = (struct dwconv_context) {
1738 .indirect_input = convolution_op->indirection_buffer,
1739 .indirect_input_width_stride = kernel_height * step_width * sizeof(void*),
1740 .indirect_input_height_stride = step_height * sizeof(void*),
1741 .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
1742 .input_batch_stride = (input_height * input_width * convolution_op->input_pixel_stride) << log2_input_element_size,
1743 .packed_weights = packed_weights(convolution_op),
1744 .output = convolution_op->output,
1745 .output_batch_stride = (output_height * output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
1746 .output_height_stride = (output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
1747 .output_width = output_width,
1748 .groups = groups,
1749 .zero = convolution_op->zero_buffer,
1750 .output_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
1751 .unipass_ukernel = convolution_op->ukernel.dwconv.unipass_function,
1752 };
1753 memcpy(&convolution_op->context.dwconv.params, &convolution_op->params, sizeof(convolution_op->context.dwconv.params));
1754
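      // Each parallel task computes one output row of one batch element.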
1755 convolution_op->compute.type = xnn_parallelization_type_2d;
1756 convolution_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv_unipass;
1757 convolution_op->compute.range[0] = batch_size;
1758 convolution_op->compute.range[1] = output_height;
1759 convolution_op->state = xnn_run_state_ready;
1760
1761 return xnn_status_success;
1762 }
1763 case xnn_ukernel_type_vmulcaddc:
1764 {
1765 const size_t batch_output_size = batch_size * convolution_op->output_height * convolution_op->output_width;
1766
1767 convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
1768 .n = convolution_op->groups << log2_input_element_size,
1769 .x = input,
1770 .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1771 .w = packed_weights(convolution_op),
1772 .y = output,
1773 .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1774 .ukernel = convolution_op->ukernel.vmulcaddc.function,
1775 };
1776 memcpy(&convolution_op->context.vmulcaddc.params, &convolution_op->params, sizeof(convolution_op->context.vmulcaddc.params));
1777
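      // Tile the batch * output-pixel range (MC) so each thread gets roughly target_tiles_per_thread
      // tiles, rounded to a multiple of the microkernel's MR (test mode fixes MC to MR).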
1778 #if XNN_TEST_MODE
1779 const size_t mc = convolution_op->ukernel.vmulcaddc.mr;
1780 #else
1781 size_t mc = batch_output_size;
1782 if (num_threads > 1) {
1783 const size_t target_tiles_per_thread = 5;
1784 const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
1785 if (max_mc < mc) {
1786 const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
1787 mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
1788 }
1789 }
1790 #endif
1791 convolution_op->compute.type = xnn_parallelization_type_1d_tile_1d;
1792 convolution_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
1793 convolution_op->compute.range[0] = batch_output_size;
1794 convolution_op->compute.tile[0] = mc;
1795 convolution_op->state = xnn_run_state_ready;
1796
1797 return xnn_status_success;
1798 }
1799 default:
1800 XNN_UNREACHABLE;
1801 }
1802 }
1803
1804 enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
1805 xnn_operator_t convolution_op,
1806 size_t batch_size,
1807 size_t input_height,
1808 size_t input_width,
1809 const uint8_t* input,
1810 uint8_t* output,
1811 pthreadpool_t threadpool)
1812 {
1813 return setup_convolution2d_nhwc(
1814 convolution_op, xnn_operator_type_convolution_nhwc_qu8,
1815 batch_size, input_height, input_width,
1816 input, output,
1817 XNN_INIT_FLAG_QU8,
1818 0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
1819 0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
1820 sizeof(int32_t) /* sizeof(extra weights elements) */,
1821 0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
1822 pthreadpool_get_threads_count(threadpool));
1823 }
1824
1825 enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
1826 xnn_operator_t convolution_op,
1827 size_t batch_size,
1828 size_t input_height,
1829 size_t input_width,
1830 const int8_t* input,
1831 int8_t* output,
1832 pthreadpool_t threadpool)
1833 {
1834 return setup_convolution2d_nhwc(
1835 convolution_op, xnn_operator_type_convolution_nhwc_qs8,
1836 batch_size, input_height, input_width,
1837 input, output,
1838 XNN_INIT_FLAG_QS8,
1839 0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
1840 0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
1841 sizeof(int32_t) /* sizeof(extra weights elements) */,
1842 0 /* log2(sizeof(output element)) = log2(sizeof(int8_t)) */,
1843 pthreadpool_get_threads_count(threadpool));
1844 }
1845
1846 enum xnn_status xnn_setup_convolution2d_nhwc_qc8(
1847 xnn_operator_t convolution_op,
1848 size_t batch_size,
1849 size_t input_height,
1850 size_t input_width,
1851 const int8_t* input,
1852 int8_t* output,
1853 pthreadpool_t threadpool)
1854 {
1855 return setup_convolution2d_nhwc(
1856 convolution_op, xnn_operator_type_convolution_nhwc_qc8,
1857 batch_size, input_height, input_width,
1858 input, output,
1859 XNN_INIT_FLAG_QC8,
1860 0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
1861 0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
1862 sizeof(int32_t) + sizeof(float) /* sizeof(extra weights elements) */,
1863 0 /* log2(sizeof(output element)) = log2(sizeof(int8_t)) */,
1864 pthreadpool_get_threads_count(threadpool));
1865 }
1866
1867 enum xnn_status xnn_setup_convolution2d_nhwc_f16(
1868 xnn_operator_t convolution_op,
1869 size_t batch_size,
1870 size_t input_height,
1871 size_t input_width,
1872 const void* input,
1873 void* output,
1874 pthreadpool_t threadpool)
1875 {
1876 return setup_convolution2d_nhwc(
1877 convolution_op, xnn_operator_type_convolution_nhwc_f16,
1878 batch_size, input_height, input_width,
1879 input, output,
1880 XNN_INIT_FLAG_F16,
1881 1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
1882 1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
1883 sizeof(uint16_t) /* sizeof(extra weights elements) */,
1884 1 /* log2(sizeof(output element)) = log2(sizeof(uint16_t)) */,
1885 pthreadpool_get_threads_count(threadpool));
1886 }
1887
1888 enum xnn_status xnn_setup_convolution2d_nhwc_f32(
1889 xnn_operator_t convolution_op,
1890 size_t batch_size,
1891 size_t input_height,
1892 size_t input_width,
1893 const float* input,
1894 float* output,
1895 pthreadpool_t threadpool)
1896 {
1897 return setup_convolution2d_nhwc(
1898 convolution_op, xnn_operator_type_convolution_nhwc_f32,
1899 batch_size, input_height, input_width,
1900 input, output,
1901 XNN_INIT_FLAG_F32,
1902 2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
1903 2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
1904 sizeof(float) /* sizeof(extra weights elements) */,
1905 2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
1906 pthreadpool_get_threads_count(threadpool));
1907 }
1908