1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <assert.h>
#include <inttypes.h>
10 #include <math.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <string.h>
16
#include <fp16.h>

17 #include <xnnpack.h>
18 #include <xnnpack/allocator.h>
19 #include <xnnpack/common.h>
20 #include <xnnpack/compute.h>
21 #include <xnnpack/indirection.h>
22 #include <xnnpack/log.h>
23 #include <xnnpack/math.h>
24 #include <xnnpack/operator.h>
25 #include <xnnpack/pack.h>
26 #include <xnnpack/params-init.h>
27 #include <xnnpack/params.h>
28
29
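// Standard convolution output-size formula: with an effective kernel extent of
// (kernel - 1) * dilation + 1, the output extent is
//   floor((padded_input - effective_kernel) / stride) + 1,
// where doz() clamps the difference at zero, so an oversized kernel yields an output of 1.
// Worked example: padded_input = 7, kernel = 3, dilation = 2, stride = 2 gives an
// effective kernel of 5 and an output extent of (7 - 5) / 2 + 1 = 2.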
30 static inline size_t compute_output_dimension(
31 size_t padded_input_dimension,
32 size_t kernel_dimension,
33 size_t dilation_dimension,
34 size_t subsampling_dimension)
35 {
36 const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
37 return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
38 }
39
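// With TensorFlow SAME padding the output extent depends only on the input extent and
// the stride: output = ceil(input / stride). The actual padding amounts are derived
// later, in setup_convolution2d_nhwc, once the input size is known.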
40 static inline size_t compute_output_dimension_with_tf_same_padding(
41 size_t input_dimension,
42 size_t subsampling_dimension)
43 {
44 return divide_round_up(input_dimension, subsampling_dimension);
45 }
46
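// Linear search of the depthwise-convolution microkernel table for a unipass kernel
// whose primary tile exactly matches the kernel size (e.g. 9 for a 3x3 kernel).
// Returns NULL if no such microkernel is registered.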
47 static const struct dwconv_parameters* find_dwconv_ukernel(
48 size_t kernel_size,
49 const struct dwconv_parameters* ukernel,
50 size_t num_ukernels)
51 {
52 while (num_ukernels-- != 0) {
53 if (ukernel->primary_tile == kernel_size) {
54 return ukernel;
55 }
56 ukernel++;
57 }
58 return NULL;
59 }
60
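// Data-type-agnostic creation path shared by the QU8, QS8, F16, and F32 entry points.
// Callers parameterize it with element sizes, weight-packing functions, microkernel
// parameter tables, and pre-initialized activation params; this function validates the
// convolution geometry, selects a microkernel type, and packs the weights.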
61 static enum xnn_status create_convolution2d_nhwc(
62 uint32_t input_padding_top,
63 uint32_t input_padding_right,
64 uint32_t input_padding_bottom,
65 uint32_t input_padding_left,
66 uint32_t kernel_height,
67 uint32_t kernel_width,
68 uint32_t subsampling_height,
69 uint32_t subsampling_width,
70 uint32_t dilation_height,
71 uint32_t dilation_width,
72 uint32_t groups,
73 size_t group_input_channels,
74 size_t group_output_channels,
75 size_t input_channel_stride,
76 size_t output_channel_stride,
77 const void* kernel,
78 const void* bias,
79 uint32_t flags,
80 uint32_t log2_input_element_size,
81 uint32_t log2_filter_element_size,
82 uint32_t bias_element_size,
83 xnn_pack_vmulcaddc_w_function pack_vmulcaddc_w,
84 xnn_pack_dwconv_hwg_w_function pack_dwconv_hwg_w,
85 xnn_pack_dwconv_ghw_w_function pack_dwconv_ghw_w,
86 xnn_pack_gemm_goi_w_function pack_gemm_goi_w,
87 xnn_pack_conv_kgo_w_function pack_conv_kgo_w,
88 xnn_pack_conv_goki_w_function pack_conv_goki_w,
89 const void* packing_params,
90 int input_padding_byte,
91 int packed_weights_padding_byte,
92 const void* params,
93 size_t params_size,
94 const struct gemm_parameters* gemm_parameters,
95 const struct dwconv_parameters* dwconv_parameters,
96 size_t num_dwconv_parameters,
97 const struct vmulcaddc_parameters* vmulcaddc_parameters,
98 bool linear_activation,
99 bool relu_activation,
100 uint32_t datatype_init_flags,
101 enum xnn_operator_type operator_type,
102 xnn_operator_t* convolution_op_out)
103 {
104 xnn_operator_t convolution_op = NULL;
105 enum xnn_status status = xnn_status_uninitialized;
106
107 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
108 xnn_log_error(
109 "failed to create %s operator: XNNPACK is not initialized",
110 xnn_operator_type_to_string(operator_type));
111 goto error;
112 }
113
114 status = xnn_status_unsupported_hardware;
115
116 if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
117 xnn_log_error(
118 "failed to create %s operator: operations on data type are not supported",
119 xnn_operator_type_to_string(operator_type));
120 goto error;
121 }
122
123 status = xnn_status_invalid_parameter;
124
125 if (kernel_width == 0 || kernel_height == 0) {
126 xnn_log_error(
127 "failed to create %s operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
128 xnn_operator_type_to_string(operator_type), kernel_width, kernel_height);
129 goto error;
130 }
131
132 if (subsampling_width == 0 || subsampling_height == 0) {
133 xnn_log_error(
134 "failed to create %s operator with %" PRIu32 "x%" PRIu32 " subsampling: subsampling dimensions must be non-zero",
135 xnn_operator_type_to_string(operator_type), subsampling_width, subsampling_height);
136 goto error;
137 }
138
139 if (dilation_width == 0 || dilation_height == 0) {
140 xnn_log_error(
141 "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
142 xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
143 goto error;
144 }
145
146 if (groups == 0) {
147 xnn_log_error(
148 "failed to create %s operator with %" PRIu32 " groups: number of groups must be non-zero",
149 xnn_operator_type_to_string(operator_type), groups);
150 goto error;
151 }
152
153 if (group_input_channels == 0) {
154 xnn_log_error(
155 "failed to create %s operator with %zu input channels per group: number of channels must be non-zero",
156 xnn_operator_type_to_string(operator_type), group_input_channels);
157 goto error;
158 }
159
160 if (group_output_channels == 0) {
161 xnn_log_error(
162 "failed to create %s operator with %zu output channels per group: number of channels must be non-zero",
163 xnn_operator_type_to_string(operator_type), group_output_channels);
164 goto error;
165 }
166
167 const size_t input_channels = groups * group_input_channels;
168 if (input_channel_stride < input_channels) {
169 xnn_log_error(
170 "failed to create %s operator with input channel stride of %zu: "
171 "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
172 xnn_operator_type_to_string(operator_type),
173 input_channel_stride, groups, group_input_channels);
174 goto error;
175 }
176
177 const size_t output_channels = groups * group_output_channels;
178 if (output_channel_stride < output_channels) {
179 xnn_log_error(
180 "failed to create %s operator with output channel stride of %zu: "
181 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
182 xnn_operator_type_to_string(operator_type),
183 output_channel_stride, groups, group_output_channels);
184 goto error;
185 }
186
187 if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
188 xnn_log_error(
189 "failed to create depthwise %s operator with %zu input channels per group: "
190 "depthwise convolution must have exactly 1 input channel per group",
191 xnn_operator_type_to_string(operator_type), group_input_channels);
192 goto error;
193 }
194
195 const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
196 if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
197 if (any_padding) {
198 xnn_log_error(
199 "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
200 "TensorFlow SAME padding can't be combined with explicit padding specification",
201 xnn_operator_type_to_string(operator_type),
202 input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
203 goto error;
204 }
205 }
206
207 status = xnn_status_out_of_memory;
208
209 convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
210 if (convolution_op == NULL) {
211 xnn_log_error(
212 "failed to allocate %zu bytes for %s operator descriptor",
213 sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
214 goto error;
215 }
216
217 const size_t kernel_size = kernel_height * kernel_width;
218
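// Select the microkernel type from the convolution geometry:
//  - vmulcaddc: 1x1 kernel, unit stride, no padding, and 1 input and 1 output channel
//    per group, i.e. a per-channel multiply-add;
//  - dwconv: depthwise case (1 input and 1 output channel per group) with a matching
//    unipass microkernel;
//  - gemm: 1x1 kernel with unit stride and no padding maps directly to GEMM;
//  - igemm: everything else goes through the indirection-based GEMM.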
219 enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_default;
220 const struct dwconv_parameters* dwconv_ukernel = NULL;
221 const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
222 if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding && vmulcaddc_parameters != NULL) {
223 ukernel_type = xnn_ukernel_type_vmulcaddc;
224 } else if (group_input_channels == 1 && group_output_channels == 1 && (dwconv_ukernel =
225 find_dwconv_ukernel(kernel_size, dwconv_parameters, num_dwconv_parameters)) != NULL)
226 {
227 ukernel_type = xnn_ukernel_type_dwconv;
228 } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
229 ukernel_type = xnn_ukernel_type_gemm;
230 } else {
231 ukernel_type = xnn_ukernel_type_igemm;
232 }
233 assert(ukernel_type != xnn_ukernel_type_default);
234
235 size_t zero_size = 0;
236 switch (ukernel_type) {
237 case xnn_ukernel_type_vmulcaddc:
238 {
239 assert(vmulcaddc_parameters != NULL);
240
241 const size_t c_stride = round_up_po2(groups, vmulcaddc_parameters->channel_tile);
242 const size_t packed_weights_size = ((UINT32_C(1) << log2_filter_element_size) + bias_element_size) * c_stride;
243 convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
244 if (convolution_op->packed_weights == NULL) {
245 xnn_log_error(
246 "failed to allocate %zu bytes for %s operator packed weights",
247 packed_weights_size, xnn_operator_type_to_string(operator_type));
248 goto error;
249 }
250
251 pack_vmulcaddc_w(
252 groups, vmulcaddc_parameters->channel_tile,
253 kernel, bias, convolution_op->packed_weights, packing_params);
254
255 convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
256 .function = vmulcaddc_parameters->ukernel,
257 .mr = vmulcaddc_parameters->row_tile,
258 };
259 break;
260 }
261 case xnn_ukernel_type_dwconv:
262 {
263 assert(dwconv_ukernel != NULL);
264 assert(dwconv_ukernel->primary_tile == kernel_size);
265
266 const size_t c_stride = round_up_po2(groups, dwconv_ukernel->channel_tile);
267 const size_t packed_weights_size = ((kernel_size << log2_filter_element_size) + bias_element_size) * c_stride;
268 convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
269 if (convolution_op->packed_weights == NULL) {
270 xnn_log_error(
271 "failed to allocate %zu bytes for %s operator packed weights",
272 packed_weights_size, xnn_operator_type_to_string(operator_type));
273 goto error;
274 }
275 memset(convolution_op->packed_weights, packed_weights_padding_byte, packed_weights_size);
276
277 if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
278 pack_dwconv_hwg_w(
279 kernel_height, kernel_width,
280 groups, dwconv_ukernel->channel_tile,
281 kernel, bias, convolution_op->packed_weights, packing_params);
282 } else {
283 pack_dwconv_ghw_w(
284 kernel_height, kernel_width,
285 groups, dwconv_ukernel->channel_tile,
286 kernel, bias, convolution_op->packed_weights, packing_params);
287 }
288
289 const union dwconv_fused_ukernels* ukernels = &dwconv_ukernel->minmax;
290 if (linear_activation && dwconv_ukernel->linear.unipass != NULL) {
291 ukernels = &dwconv_ukernel->linear;
292 }
293 convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
294 .unipass_function = ukernels->unipass,
295 .primary_tile = dwconv_ukernel->primary_tile,
296 .incremental_tile = dwconv_ukernel->incremental_tile,
297 };
298
299 zero_size = XNN_EXTRA_BYTES + (c_stride << log2_input_element_size);
300 break;
301 }
302 case xnn_ukernel_type_gemm:
303 case xnn_ukernel_type_igemm:
304 {
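// GEMM/IGEMM path: weights are packed per group, with output channels rounded up to
// the microkernel's nr tile and input channels rounded up to the kr tile, so every
// group occupies the same packed_group_weights_size bytes.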
305 const uint32_t nr = gemm_parameters->nr;
306 const uint32_t kr = UINT32_C(1) << gemm_parameters->log2_kr;
307 const uint32_t sr = UINT32_C(1) << gemm_parameters->log2_sr;
308 const size_t n_stride = round_up(group_output_channels, nr);
309 const size_t k_stride = round_up_po2(group_input_channels, kr);
310
311 const size_t packed_group_weights_size = ((kernel_size * k_stride << log2_filter_element_size) + bias_element_size) * n_stride;
312 convolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
313 if (convolution_op->packed_weights == NULL) {
314 xnn_log_error(
315 "failed to allocate %zu bytes for %s operator packed weights",
316 packed_group_weights_size * groups, xnn_operator_type_to_string(operator_type));
317 goto error;
318 }
319 memset(convolution_op->packed_weights, packed_weights_padding_byte, packed_group_weights_size * groups);
320
321 const struct gemm_fused_ukernels* gemm_ukernels = &gemm_parameters->minmax;
322 if (linear_activation && gemm_parameters->linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
323 gemm_ukernels = &gemm_parameters->linear;
324 } else if (relu_activation && gemm_parameters->relu.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
325 gemm_ukernels = &gemm_parameters->relu;
326 }
327 switch (ukernel_type) {
328 case xnn_ukernel_type_gemm:
329 pack_gemm_goi_w(
330 groups, group_output_channels, group_input_channels,
331 nr, kr, sr,
332 kernel, bias, convolution_op->packed_weights, packing_params);
333 convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
334 .mr = gemm_parameters->mr,
335 .nr = nr,
336 .kr = kr,
337 .general_case = gemm_ukernels->gemm,
338 .mr1_case = gemm_ukernels->gemm1,
339 };
340 break;
341 case xnn_ukernel_type_igemm:
342 if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
343 pack_conv_kgo_w(
344 groups, group_output_channels, kernel_size,
345 nr, kr,
346 kernel, bias, convolution_op->packed_weights, packing_params);
347 } else {
348 pack_conv_goki_w(
349 groups, group_output_channels, kernel_size, group_input_channels,
350 nr, kr, sr,
351 kernel, bias, convolution_op->packed_weights, packing_params);
352 }
353 convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
354 .mr = gemm_parameters->mr,
355 .nr = nr,
356 .kr = kr,
357 .general_case = gemm_ukernels->igemm,
358 .mr1_case = gemm_ukernels->igemm1,
359 };
360 break;
361 default:
362 XNN_UNREACHABLE;
363 }
364
365 zero_size = XNN_EXTRA_BYTES + (k_stride << log2_input_element_size);
366 break;
367 }
368 default:
369 XNN_UNREACHABLE;
370 }
371
372 const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
373 if (any_padding || tf_same_padding) {
374 convolution_op->zero_buffer = xnn_allocate_simd_memory(zero_size);
375 if (convolution_op->zero_buffer == NULL) {
376 xnn_log_error(
377 "failed to allocate %zu bytes for %s operator zero padding",
378 zero_size, xnn_operator_type_to_string(operator_type));
379 goto error;
380 }
381 memset(convolution_op->zero_buffer, input_padding_byte, zero_size);
382 }
383
384 convolution_op->padding_top = input_padding_top;
385 convolution_op->padding_right = input_padding_right;
386 convolution_op->padding_bottom = input_padding_bottom;
387 convolution_op->padding_left = input_padding_left;
388
389 convolution_op->kernel_height = kernel_height;
390 convolution_op->kernel_width = kernel_width;
391 convolution_op->stride_height = subsampling_height;
392 convolution_op->stride_width = subsampling_width;
393 convolution_op->dilation_height = dilation_height;
394 convolution_op->dilation_width = dilation_width;
395 convolution_op->groups = groups;
396 convolution_op->group_input_channels = group_input_channels;
397 convolution_op->group_output_channels = group_output_channels;
398 convolution_op->input_pixel_stride = input_channel_stride;
399 convolution_op->output_pixel_stride = output_channel_stride;
400
401 memcpy(&convolution_op->params, params, params_size);
402 convolution_op->type = operator_type;
403 convolution_op->ukernel.type = ukernel_type;
404 if (tf_same_padding) {
405 convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
406 }
407
408 convolution_op->state = xnn_run_state_invalid;
409
410 *convolution_op_out = convolution_op;
411 return xnn_status_success;
412
413 error:
414 xnn_delete_operator(convolution_op);
415 return status;
416 }
417
418 enum xnn_status xnn_create_convolution2d_nhwc_qu8(
419 uint32_t input_padding_top,
420 uint32_t input_padding_right,
421 uint32_t input_padding_bottom,
422 uint32_t input_padding_left,
423 uint32_t kernel_height,
424 uint32_t kernel_width,
425 uint32_t subsampling_height,
426 uint32_t subsampling_width,
427 uint32_t dilation_height,
428 uint32_t dilation_width,
429 uint32_t groups,
430 size_t group_input_channels,
431 size_t group_output_channels,
432 size_t input_channel_stride,
433 size_t output_channel_stride,
434 uint8_t input_zero_point,
435 float input_scale,
436 uint8_t kernel_zero_point,
437 float kernel_scale,
438 const uint8_t* kernel,
439 const int32_t* bias,
440 uint8_t output_zero_point,
441 float output_scale,
442 uint8_t output_min,
443 uint8_t output_max,
444 uint32_t flags,
445 xnn_operator_t* convolution_op_out)
446 {
447 if (input_scale <= 0.0f || !isnormal(input_scale)) {
448 xnn_log_error(
449 "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
450 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), input_scale);
451 return xnn_status_invalid_parameter;
452 }
453
454 if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
455 xnn_log_error(
456 "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
457 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), kernel_scale);
458 return xnn_status_invalid_parameter;
459 }
460
461 if (output_scale <= 0.0f || !isnormal(output_scale)) {
462 xnn_log_error(
463 "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
464 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_scale);
465 return xnn_status_invalid_parameter;
466 }
467
468 if (output_min >= output_max) {
469 xnn_log_error(
470 "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
471 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8), output_min, output_max);
472 return xnn_status_invalid_parameter;
473 }
474
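// The requantization multiplier combines the input, kernel, and output scales. The
// fixed-point requantization scheme used by the Q8 microkernels requires this
// multiplier to be strictly below 1.0, so larger values are rejected as unsupported.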
475 const float requantization_scale = input_scale * kernel_scale / output_scale;
476 if (requantization_scale >= 1.0f) {
477 xnn_log_error(
478 "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
479 "requantization scale %.7g is greater or equal to 1.0",
480 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8),
481 input_scale, kernel_scale, output_scale, requantization_scale);
482 return xnn_status_unsupported_parameter;
483 }
484
485 const struct xnn_qu8_packing_params packing_params = {
486 .input_zero_point = input_zero_point,
487 .kernel_zero_point = kernel_zero_point,
488 };
489 const union xnn_qu8_gemm_params params = xnn_init_qu8_gemm_params(
490 kernel_zero_point, requantization_scale, output_zero_point, output_min, output_max);
491 return create_convolution2d_nhwc(
492 input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
493 kernel_height, kernel_width,
494 subsampling_height, subsampling_width,
495 dilation_height, dilation_width,
496 groups, group_input_channels, group_output_channels,
497 input_channel_stride, output_channel_stride,
498 kernel, bias, flags,
499 0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
500 0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
501 sizeof(int32_t) /* sizeof(bias element) */,
502 (xnn_pack_vmulcaddc_w_function) NULL,
503 (xnn_pack_dwconv_hwg_w_function) xnn_pack_qu8_dwconv_hwg_w,
504 (xnn_pack_dwconv_ghw_w_function) xnn_pack_qu8_dwconv_ghw_w,
505 (xnn_pack_gemm_goi_w_function) xnn_pack_qu8_gemm_goi_w,
506 (xnn_pack_conv_kgo_w_function) xnn_pack_qu8_conv_kgo_w,
507 (xnn_pack_conv_goki_w_function) xnn_pack_qu8_conv_goki_w,
508 &packing_params, input_zero_point /* input padding byte */, kernel_zero_point /* packed weights padding byte */,
509 &params, sizeof(params),
510 &xnn_params.qu8.gemm, xnn_params.qu8.dwconv, XNN_MAX_QU8_DWCONV_UKERNELS, NULL /* vmulcaddc parameters */,
511 false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QU8,
512 xnn_operator_type_convolution_nhwc_qu8,
513 convolution_op_out);
514 }
515
516 enum xnn_status xnn_create_convolution2d_nhwc_qs8(
517 uint32_t input_padding_top,
518 uint32_t input_padding_right,
519 uint32_t input_padding_bottom,
520 uint32_t input_padding_left,
521 uint32_t kernel_height,
522 uint32_t kernel_width,
523 uint32_t subsampling_height,
524 uint32_t subsampling_width,
525 uint32_t dilation_height,
526 uint32_t dilation_width,
527 uint32_t groups,
528 size_t group_input_channels,
529 size_t group_output_channels,
530 size_t input_channel_stride,
531 size_t output_channel_stride,
532 int8_t input_zero_point,
533 float input_scale,
534 float kernel_scale,
535 const int8_t* kernel,
536 const int32_t* bias,
537 int8_t output_zero_point,
538 float output_scale,
539 int8_t output_min,
540 int8_t output_max,
541 uint32_t flags,
542 xnn_operator_t* convolution_op_out)
543 {
544 if (input_scale <= 0.0f || !isnormal(input_scale)) {
545 xnn_log_error(
546 "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
547 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), input_scale);
548 return xnn_status_invalid_parameter;
549 }
550
551 if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
552 xnn_log_error(
553 "failed to create %s operator with %.7g kernel scale: scale must be finite, normalized, and positive",
554 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), kernel_scale);
555 return xnn_status_invalid_parameter;
556 }
557
558 if (output_scale <= 0.0f || !isnormal(output_scale)) {
559 xnn_log_error(
560 "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
561 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_scale);
562 return xnn_status_invalid_parameter;
563 }
564
565 if (output_min >= output_max) {
566 xnn_log_error(
567 "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: range min must be below range max",
568 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8), output_min, output_max);
569 return xnn_status_invalid_parameter;
570 }
571
572 const float requantization_scale = input_scale * kernel_scale / output_scale;
573 if (requantization_scale >= 1.0f) {
574 xnn_log_error(
575 "failed to create %s operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
576 "requantization scale %.7g is greater or equal to 1.0",
577 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8),
578 input_scale, kernel_scale, output_scale, requantization_scale);
579 return xnn_status_unsupported_parameter;
580 }
581
582 const struct xnn_qs8_packing_params packing_params = { .input_zero_point = input_zero_point, };
583 const union xnn_qs8_gemm_params params = xnn_init_qs8_gemm_params(
584 requantization_scale, output_zero_point, output_min, output_max);
585 return create_convolution2d_nhwc(
586 input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
587 kernel_height, kernel_width,
588 subsampling_height, subsampling_width,
589 dilation_height, dilation_width,
590 groups, group_input_channels, group_output_channels,
591 input_channel_stride, output_channel_stride,
592 kernel, bias, flags,
593 0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
594 0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
595 sizeof(int32_t) /* sizeof(bias element) */,
596 (xnn_pack_vmulcaddc_w_function) NULL,
597 (xnn_pack_dwconv_hwg_w_function) xnn_pack_qs8_dwconv_hwg_w,
598 (xnn_pack_dwconv_ghw_w_function) xnn_pack_qs8_dwconv_ghw_w,
599 (xnn_pack_gemm_goi_w_function) xnn_pack_qs8_gemm_goi_w,
600 (xnn_pack_conv_kgo_w_function) xnn_pack_qs8_conv_kgo_w,
601 (xnn_pack_conv_goki_w_function) xnn_pack_qs8_conv_goki_w,
602 &packing_params, input_zero_point /* input padding byte */, 0 /* packed weights padding byte */,
603 &params, sizeof(params),
604 &xnn_params.qs8.gemm, xnn_params.qs8.dwconv, XNN_MAX_QS8_DWCONV_UKERNELS, NULL /* vmulcaddc parameters */,
605 false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_QS8,
606 xnn_operator_type_convolution_nhwc_qs8,
607 convolution_op_out);
608 }
609
610 enum xnn_status xnn_create_convolution2d_nhwc_f16(
611 uint32_t input_padding_top,
612 uint32_t input_padding_right,
613 uint32_t input_padding_bottom,
614 uint32_t input_padding_left,
615 uint32_t kernel_height,
616 uint32_t kernel_width,
617 uint32_t subsampling_height,
618 uint32_t subsampling_width,
619 uint32_t dilation_height,
620 uint32_t dilation_width,
621 uint32_t groups,
622 size_t group_input_channels,
623 size_t group_output_channels,
624 size_t input_channel_stride,
625 size_t output_channel_stride,
626 const void* kernel,
627 const void* bias,
628 float output_min,
629 float output_max,
630 uint32_t flags,
631 xnn_operator_t* convolution_op_out)
632 {
633 if (isnan(output_min)) {
634 xnn_log_error(
635 "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
636 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
637 return xnn_status_invalid_parameter;
638 }
639
640 if (isnan(output_max)) {
641 xnn_log_error(
642 "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
643 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16));
644 return xnn_status_invalid_parameter;
645 }
646
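// The output bounds are rounded to IEEE half precision before the range check, since
// that is the range the F16 microkernels will actually enforce: bounds that only
// differ beyond fp16 precision would otherwise slip through as an empty range.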
647 const uint16_t fp16_output_min = fp16_ieee_from_fp32_value(output_min);
648 const uint16_t fp16_output_max = fp16_ieee_from_fp32_value(output_max);
649 const float rounded_output_min = fp16_ieee_to_fp32_value(fp16_output_min);
650 const float rounded_output_max = fp16_ieee_to_fp32_value(fp16_output_max);
651 if (rounded_output_min >= rounded_output_max) {
652 xnn_log_error(
653 "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
654 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16), rounded_output_min, rounded_output_max);
655 return xnn_status_invalid_parameter;
656 }
657
658 const struct {
659 struct xnn_f16_minmax_params minmax;
660 struct xnn_f16_scaleminmax_params scaleminmax;
661 } params = {
662 .minmax = xnn_init_f16_minmax_params(fp16_output_min, fp16_output_max),
663 .scaleminmax = xnn_init_f16_scaleminmax_params(
664 UINT16_C(0x3C00) /* 1.0 */, fp16_output_min, fp16_output_max),
665 };
666 return create_convolution2d_nhwc(
667 input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
668 kernel_height, kernel_width,
669 subsampling_height, subsampling_width,
670 dilation_height, dilation_width,
671 groups, group_input_channels, group_output_channels,
672 input_channel_stride, output_channel_stride,
673 kernel, bias, flags,
674 1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
675 1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
676 sizeof(uint16_t) /* sizeof(bias element) */,
677 (xnn_pack_vmulcaddc_w_function) xnn_pack_f16_vmulcaddc_w,
678 (xnn_pack_dwconv_hwg_w_function) xnn_pack_f16_dwconv_hwg_w,
679 (xnn_pack_dwconv_ghw_w_function) xnn_pack_f16_dwconv_ghw_w,
680 (xnn_pack_gemm_goi_w_function) xnn_pack_f16_gemm_goi_w,
681 (xnn_pack_conv_kgo_w_function) xnn_pack_f16_conv_kgo_w,
682 (xnn_pack_conv_goki_w_function) xnn_pack_f16_conv_goki_w,
683 NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
684 &params, sizeof(params),
685 &xnn_params.f16.gemm, xnn_params.f16.dwconv, XNN_MAX_F16_DWCONV_UKERNELS, &xnn_params.f16.vmulcaddc,
686 false /* linear activation */, false /* relu activation */, XNN_INIT_FLAG_F16,
687 xnn_operator_type_convolution_nhwc_f16,
688 convolution_op_out);
689 }
690
691 enum xnn_status xnn_create_convolution2d_nhwc_f32(
692 uint32_t input_padding_top,
693 uint32_t input_padding_right,
694 uint32_t input_padding_bottom,
695 uint32_t input_padding_left,
696 uint32_t kernel_height,
697 uint32_t kernel_width,
698 uint32_t subsampling_height,
699 uint32_t subsampling_width,
700 uint32_t dilation_height,
701 uint32_t dilation_width,
702 uint32_t groups,
703 size_t group_input_channels,
704 size_t group_output_channels,
705 size_t input_channel_stride,
706 size_t output_channel_stride,
707 const float* kernel,
708 const float* bias,
709 float output_min,
710 float output_max,
711 uint32_t flags,
712 xnn_operator_t* convolution_op_out)
713 {
714 if (isnan(output_min)) {
715 xnn_log_error(
716 "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
717 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
718 return xnn_status_invalid_parameter;
719 }
720
721 if (isnan(output_max)) {
722 xnn_log_error(
723 "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
724 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32));
725 return xnn_status_invalid_parameter;
726 }
727
728 if (output_min >= output_max) {
729 xnn_log_error(
730 "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
731 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32), output_min, output_max);
732 return xnn_status_invalid_parameter;
733 }
734
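// An output range of (-inf, +inf) selects linear-activation (unclamped) microkernels
// and [0, +inf) selects ReLU microkernels when such variants exist; otherwise the
// generic minmax microkernels clamp to the requested range.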
735 const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
736 const bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f);
737 const union xnn_f32_minmax_params params = xnn_init_f32_minmax_params(output_min, output_max);
738 return create_convolution2d_nhwc(
739 input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
740 kernel_height, kernel_width,
741 subsampling_height, subsampling_width,
742 dilation_height, dilation_width,
743 groups, group_input_channels, group_output_channels,
744 input_channel_stride, output_channel_stride,
745 kernel, bias, flags,
746 2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
747 2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
748 sizeof(float) /* sizeof(bias element) */,
749 (xnn_pack_vmulcaddc_w_function) xnn_pack_f32_vmulcaddc_w,
750 (xnn_pack_dwconv_hwg_w_function) xnn_pack_f32_dwconv_hwg_w,
751 (xnn_pack_dwconv_ghw_w_function) xnn_pack_f32_dwconv_ghw_w,
752 (xnn_pack_gemm_goi_w_function) xnn_pack_f32_gemm_goi_w,
753 (xnn_pack_conv_kgo_w_function) xnn_pack_f32_conv_kgo_w,
754 (xnn_pack_conv_goki_w_function) xnn_pack_f32_conv_goki_w,
755 NULL /* packing params */, 0 /* input padding byte */, 0 /* packed weights padding byte */,
756 &params, sizeof(params),
757 &xnn_params.f32.gemm, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS, &xnn_params.f32.vmulcaddc,
758 linear_activation, relu_activation, XNN_INIT_FLAG_F32,
759 xnn_operator_type_convolution_nhwc_f32,
760 convolution_op_out);
761 }
762
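// Shared setup path: validates the input shape, computes the output spatial dimensions
// (including TensorFlow SAME padding), rebuilds the indirection buffer when needed, and
// fills in the per-microkernel compute context and pthreadpool parallelization plan.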
763 static enum xnn_status setup_convolution2d_nhwc(
764 xnn_operator_t convolution_op,
765 size_t batch_size,
766 size_t input_height,
767 size_t input_width,
768 const void* input,
769 void* output,
770 uint32_t datatype_init_flags,
771 uint32_t log2_input_element_size,
772 uint32_t log2_filter_element_size,
773 uint32_t bias_element_size,
774 uint32_t log2_output_element_size,
775 const void* gemm_params,
776 const void* dwconv_params,
777 size_t num_threads)
778 {
779 convolution_op->state = xnn_run_state_invalid;
780
781 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
782 xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
783 xnn_operator_type_to_string(convolution_op->type));
784 return xnn_status_uninitialized;
785 }
786
787 if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
788 xnn_log_error(
789 "failed to create %s operator: operations on data type are not supported",
790 xnn_operator_type_to_string(convolution_op->type));
791 return xnn_status_unsupported_hardware;
792 }
793
794 if (input_width == 0 || input_height == 0) {
795 xnn_log_error(
796 "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
797 xnn_operator_type_to_string(convolution_op->type), input_width, input_height);
798 return xnn_status_invalid_parameter;
799 }
800
801 if (batch_size == 0) {
802 convolution_op->state = xnn_run_state_skip;
803 return xnn_status_success;
804 }
805
806 convolution_op->batch_size = batch_size;
807 convolution_op->input_height = input_height;
808 convolution_op->input_width = input_width;
809 convolution_op->input = input;
810
811 if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
812 convolution_op->output_height = compute_output_dimension_with_tf_same_padding(
813 input_height, convolution_op->stride_height);
814 convolution_op->output_width = compute_output_dimension_with_tf_same_padding(
815 input_width, convolution_op->stride_width);
816
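// For TensorFlow SAME padding, the total padding is derived from the just-computed
// output size and split across both edges, with the odd pixel going to the
// bottom/right edge, matching TensorFlow's convention.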
817 const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1;
818 const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1;
819 const size_t total_padding_height =
820 (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height;
821 const size_t total_padding_width =
822 (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width;
823 convolution_op->padding_top = total_padding_height / 2;
824 convolution_op->padding_left = total_padding_width / 2;
825 convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top;
826 convolution_op->padding_right = total_padding_width - convolution_op->padding_left;
827 } else {
828 convolution_op->output_height = compute_output_dimension(
829 convolution_op->padding_top + input_height + convolution_op->padding_bottom,
830 convolution_op->kernel_height,
831 convolution_op->dilation_height,
832 convolution_op->stride_height);
833 convolution_op->output_width = compute_output_dimension(
834 convolution_op->padding_left + input_width + convolution_op->padding_right,
835 convolution_op->kernel_width,
836 convolution_op->dilation_width,
837 convolution_op->stride_width);
838 }
839 convolution_op->output = output;
840
841 switch (convolution_op->ukernel.type) {
842 case xnn_ukernel_type_gemm:
843 {
844 // Convolution maps directly to GEMM and doesn't use indirection buffer.
845
846 const size_t output_height = convolution_op->output_height;
847 const size_t output_width = convolution_op->output_width;
848 const size_t output_size = output_height * output_width;
849 const size_t batch_output_size = batch_size * output_size;
850
851 const size_t groups = convolution_op->groups;
852 const size_t group_input_channels = convolution_op->group_input_channels;
853 const size_t w_stride = bias_element_size +
854 (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr) << log2_filter_element_size);
855 const size_t group_output_channels = convolution_op->group_output_channels;
856
857 uint32_t mr = convolution_op->ukernel.gemm.mr;
858 const uint32_t nr = convolution_op->ukernel.gemm.nr;
859 struct xnn_hmp_gemm_ukernel gemm_ukernel = convolution_op->ukernel.gemm.general_case;
860 if (batch_output_size == 1 && convolution_op->ukernel.gemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
861 mr = 1;
862 gemm_ukernel = convolution_op->ukernel.gemm.mr1_case;
863 }
864
865 convolution_op->context.gemm = (struct gemm_context) {
866 .k_scaled = group_input_channels << log2_input_element_size,
867 .a = input,
868 .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
869 .packed_w = convolution_op->packed_weights,
870 .w_stride = w_stride,
871 .wg_stride = w_stride * round_up(group_output_channels, nr),
872 .c = output,
873 .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
874 .cn_stride = nr << log2_output_element_size,
875 .cg_stride = group_output_channels << log2_output_element_size,
876 .log2_csize = log2_output_element_size,
877 .ukernel = gemm_ukernel,
878 };
879 memcpy(&convolution_op->context.gemm.params, gemm_params, sizeof(convolution_op->context.gemm.params));
880
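// Multithreaded heuristic: if needed, shrink the output-channel tile nc so that each
// thread ends up with roughly target_tiles_per_thread tiles of work.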
881 size_t nc = group_output_channels;
882 if (num_threads > 1) {
883 const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
884 const size_t target_tiles_per_thread = 5;
885 const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
886 if (max_nc < nc) {
887 nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
888 }
889 }
890 if (groups == 1) {
891 #if XNN_MAX_UARCH_TYPES > 1
892 if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
893 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
894 convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm;
895 } else {
896 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
897 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
898 }
899 #else
900 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
901 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
902 #endif
903 convolution_op->compute.range[0] = batch_output_size;
904 convolution_op->compute.range[1] = group_output_channels;
905 convolution_op->compute.tile[0] = mr;
906 convolution_op->compute.tile[1] = nc;
907 } else {
908 #if XNN_MAX_UARCH_TYPES > 1
909 if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
910 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
911 convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_gemm;
912 } else {
913 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
914 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
915 }
916 #else
917 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
918 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
919 #endif
920 convolution_op->compute.range[0] = groups;
921 convolution_op->compute.range[1] = batch_output_size;
922 convolution_op->compute.range[2] = group_output_channels;
923 convolution_op->compute.tile[0] = mr;
924 convolution_op->compute.tile[1] = nc;
925 }
926 convolution_op->state = xnn_run_state_ready;
927
928 return xnn_status_success;
929 }
930 case xnn_ukernel_type_igemm:
931 {
932 const size_t groups = convolution_op->groups;
933 const size_t kernel_height = convolution_op->kernel_height;
934 const size_t kernel_width = convolution_op->kernel_width;
935 const size_t kernel_size = kernel_height * kernel_width;
936 const size_t output_height = convolution_op->output_height;
937 const size_t output_width = convolution_op->output_width;
938 const size_t output_size = output_height * output_width;
939
940 uint32_t mr = convolution_op->ukernel.igemm.mr;
941 const uint32_t nr = convolution_op->ukernel.igemm.nr;
942 struct xnn_hmp_igemm_ukernel igemm_ukernel = convolution_op->ukernel.igemm.general_case;
943 if (output_size == 1 && convolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
944 mr = 1;
945 igemm_ukernel = convolution_op->ukernel.igemm.mr1_case;
946 }
947
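// The IGEMM indirection buffer holds kernel_size input-pixel pointers per output pixel,
// padded up to a multiple of mr output pixels. It only needs to be rebuilt when the
// input spatial dimensions change; for a different input pointer of the same shape,
// the a_offset field below accounts for the difference.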
948 const size_t tiled_output_size = round_up(output_size, mr);
949 const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;
950
951 if (input_height != convolution_op->last_input_height ||
952 input_width != convolution_op->last_input_width)
953 {
954 const void** indirection_buffer = (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size);
955 if (indirection_buffer == NULL) {
956 xnn_log_error(
957 "failed to allocate %zu bytes for %s operator indirection buffer",
958 indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
959 return xnn_status_out_of_memory;
960 }
961 convolution_op->indirection_buffer = indirection_buffer;
962 convolution_op->last_input = input;
963 convolution_op->last_input_height = input_height;
964 convolution_op->last_input_width = input_width;
965
966 xnn_indirection_init_conv2d(convolution_op, mr, log2_input_element_size);
967 }
968
969 const size_t group_input_channels = convolution_op->group_input_channels;
970 const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size) + bias_element_size;
971 const size_t group_output_channels = convolution_op->group_output_channels;
972 convolution_op->context.igemm = (struct igemm_context) {
973 .ks = kernel_size,
974 .ks_scaled = kernel_size * mr * sizeof(void*),
975 .kc = group_input_channels << log2_input_element_size,
976 .w_stride = w_stride,
977 .indirect_a = convolution_op->indirection_buffer,
978 .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
979 .zero = convolution_op->zero_buffer,
980 .packed_w = convolution_op->packed_weights,
981 .c = convolution_op->output,
982 .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
983 .cn_stride = nr << log2_output_element_size,
984 .ga_stride = group_input_channels << log2_input_element_size,
985 .gw_stride = w_stride * round_up(group_output_channels, nr),
986 .gc_stride = group_output_channels << log2_output_element_size,
987 .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
988 .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
989 .log2_csize = log2_output_element_size,
990 .ukernel = igemm_ukernel,
991 };
992 memcpy(&convolution_op->context.igemm.params, gemm_params, sizeof(convolution_op->context.igemm.params));
993
994 size_t nc = group_output_channels;
995 if (num_threads > 1) {
996 const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
997 const size_t target_tiles_per_thread = 5;
998 const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
999 if (max_nc < nc) {
1000 nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
1001 }
1002 }
1003 if (groups == 1) {
1004 #if XNN_MAX_UARCH_TYPES > 1
1005 if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1006 if (batch_size > 1) {
1007 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1008 convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_batch_hmp_igemm;
1009 } else {
1010 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
1011 convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_igemm;
1012 }
1013 } else {
1014 if (batch_size > 1) {
1015 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1016 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1017 } else {
1018 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1019 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1020 }
1021 }
1022 #else
1023 if (batch_size > 1) {
1024 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1025 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_batch_igemm;
1026 } else {
1027 convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
1028 convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_igemm;
1029 }
1030 #endif
1031 if (batch_size > 1) {
1032 convolution_op->compute.range[0] = batch_size;
1033 convolution_op->compute.range[1] = output_size;
1034 convolution_op->compute.range[2] = group_output_channels;
1035 } else {
1036 convolution_op->compute.range[0] = output_size;
1037 convolution_op->compute.range[1] = group_output_channels;
1038 }
1039 convolution_op->compute.tile[0] = mr;
1040 convolution_op->compute.tile[1] = nc;
1041 } else {
1042 #if XNN_MAX_UARCH_TYPES > 1
1043 if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
1044 if (batch_size > 1) {
1045 convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d_with_uarch;
1046 convolution_op->compute.task_4d_tile_2d_with_id = (pthreadpool_task_4d_tile_2d_with_id_t) xnn_compute_hmp_grouped_batch_igemm;
1047 } else {
1048 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
1049 convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_igemm;
1050 }
1051 } else {
1052 if (batch_size > 1) {
1053 convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
1054 convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1055 } else {
1056 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1057 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1058 }
1059 }
1060 #else
1061 if (batch_size > 1) {
1062 convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
1063 convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_batch_igemm;
1064 } else {
1065 convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
1066 convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_igemm;
1067 }
1068 #endif
1069 if (batch_size > 1) {
1070 convolution_op->compute.range[0] = batch_size;
1071 convolution_op->compute.range[1] = groups;
1072 convolution_op->compute.range[2] = output_size;
1073 convolution_op->compute.range[3] = group_output_channels;
1074 } else {
1075 convolution_op->compute.range[0] = groups;
1076 convolution_op->compute.range[1] = output_size;
1077 convolution_op->compute.range[2] = group_output_channels;
1078 }
1079 convolution_op->compute.tile[0] = mr;
1080 convolution_op->compute.tile[1] = nc;
1081 }
1082 convolution_op->state = xnn_run_state_ready;
1083
1084 return xnn_status_success;
1085 }
1086 case xnn_ukernel_type_dwconv:
1087 {
1088 const size_t kernel_height = convolution_op->kernel_height;
1089 const size_t kernel_width = convolution_op->kernel_width;
1090 const size_t kernel_size = kernel_height * kernel_width;
1091 const size_t output_height = convolution_op->output_height;
1092 const size_t output_width = convolution_op->output_width;
1093 const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
1094 const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
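// The dwconv indirection buffer stores step_height pointers per output row: the full
// kernel window for the first output pixel plus step_width * kernel_height pointers
// for each subsequent pixel in the row. It is rebuilt only when the input spatial
// dimensions change.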
1095 if (input_height != convolution_op->last_input_height || input_width != convolution_op->last_input_width) {
1096 const size_t indirection_buffer_size = sizeof(void*) * output_height * step_height;
1097
1098 const void** indirection_buffer =
1099 (const void**) xnn_reallocate_memory(convolution_op->indirection_buffer, indirection_buffer_size);
1100 if (indirection_buffer == NULL) {
1101 xnn_log_error("failed to allocate %zu bytes for %s operator indirection buffer",
1102 indirection_buffer_size, xnn_operator_type_to_string(convolution_op->type));
1103 return xnn_status_out_of_memory;
1104 }
1105 convolution_op->indirection_buffer = indirection_buffer;
1106
1107 xnn_indirection_init_dwconv2d(convolution_op, step_height, step_width, log2_input_element_size);
1108
1109 convolution_op->last_input = input;
1110 convolution_op->last_input_height = input_height;
1111 convolution_op->last_input_width = input_width;
1112 }
1113
1114 const size_t groups = convolution_op->groups;
1115 convolution_op->context.dwconv = (struct dwconv_context) {
1116 .indirect_input = convolution_op->indirection_buffer,
1117 .indirect_input_width_stride = kernel_height * step_width * sizeof(void*),
1118 .indirect_input_height_stride = step_height * sizeof(void*),
1119 .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
1120 .input_batch_stride = (input_height * input_width * convolution_op->input_pixel_stride) << log2_input_element_size,
1121 .packed_weights = convolution_op->packed_weights,
1122 .output = convolution_op->output,
1123 .output_batch_stride = (output_height * output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
1124 .output_height_stride = (output_width * convolution_op->output_pixel_stride) << log2_output_element_size,
1125 .output_width = output_width,
1126 .groups = groups,
1127 .zero = convolution_op->zero_buffer,
1128 .output_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
1129 .unipass_ukernel = convolution_op->ukernel.dwconv.unipass_function,
1130 };
1131 memcpy(&convolution_op->context.dwconv.params, dwconv_params, sizeof(convolution_op->context.dwconv.params));
1132
1133 convolution_op->compute.type = xnn_parallelization_type_2d;
1134 convolution_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv_unipass;
1135 convolution_op->compute.range[0] = batch_size;
1136 convolution_op->compute.range[1] = output_height;
1137 convolution_op->state = xnn_run_state_ready;
1138
1139 return xnn_status_success;
1140 }
1141 case xnn_ukernel_type_vmulcaddc:
1142 {
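// vmulcaddc path: a 1x1 "convolution" with one input and one output channel per group
// degenerates into an elementwise y = x * w + b over all batch * height * width
// pixels, processed in row tiles of mr.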
1143 const size_t batch_output_size = batch_size * convolution_op->output_height * convolution_op->output_width;
1144
1145 convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
1146 .n = convolution_op->groups << log2_input_element_size,
1147 .x = input,
1148 .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
1149 .w = convolution_op->packed_weights,
1150 .y = output,
1151 .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
1152 .ukernel = convolution_op->ukernel.vmulcaddc.function,
1153 };
1154 memcpy(&convolution_op->context.vmulcaddc.params, dwconv_params, sizeof(convolution_op->context.vmulcaddc.params));
1155
1156 size_t mc = batch_output_size;
1157 if (num_threads > 1) {
1158 const size_t target_tiles_per_thread = 5;
1159 const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
1160 if (max_mc < mc) {
1161 const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
1162 mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
1163 }
1164 }
1165 convolution_op->compute.type = xnn_parallelization_type_1d_tile_1d;
1166 convolution_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
1167 convolution_op->compute.range[0] = batch_output_size;
1168 convolution_op->compute.tile[0] = mc;
1169 convolution_op->state = xnn_run_state_ready;
1170
1171 return xnn_status_success;
1172 }
1173 default:
1174 XNN_UNREACHABLE;
1175 }
1176 }
1177
1178 enum xnn_status xnn_setup_convolution2d_nhwc_qu8(
1179 xnn_operator_t convolution_op,
1180 size_t batch_size,
1181 size_t input_height,
1182 size_t input_width,
1183 const uint8_t* input,
1184 uint8_t* output,
1185 pthreadpool_t threadpool)
1186 {
1187 if (convolution_op->type != xnn_operator_type_convolution_nhwc_qu8) {
1188 xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
1189 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qu8),
1190 xnn_operator_type_to_string(convolution_op->type));
1191 return xnn_status_invalid_parameter;
1192 }
1193
1194 return setup_convolution2d_nhwc(
1195 convolution_op,
1196 batch_size, input_height, input_width,
1197 input, output,
1198 XNN_INIT_FLAG_QU8,
1199 0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
1200 0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
1201 sizeof(int32_t) /* sizeof(bias element) */,
1202 0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
1203 &convolution_op->params.qu8_gemm,
1204 &convolution_op->params.qu8_gemm,
1205 pthreadpool_get_threads_count(threadpool));
1206 }
1207
1208 enum xnn_status xnn_setup_convolution2d_nhwc_qs8(
1209 xnn_operator_t convolution_op,
1210 size_t batch_size,
1211 size_t input_height,
1212 size_t input_width,
1213 const int8_t* input,
1214 int8_t* output,
1215 pthreadpool_t threadpool)
1216 {
1217 if (convolution_op->type != xnn_operator_type_convolution_nhwc_qs8) {
1218 xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
1219 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_qs8),
1220 xnn_operator_type_to_string(convolution_op->type));
1221 return xnn_status_invalid_parameter;
1222 }
1223
1224 return setup_convolution2d_nhwc(
1225 convolution_op,
1226 batch_size, input_height, input_width,
1227 input, output,
1228 XNN_INIT_FLAG_QS8,
1229 0 /* log2(sizeof(input element)) = log2(sizeof(int8_t)) */,
1230 0 /* log2(sizeof(filter element)) = log2(sizeof(int8_t)) */,
1231 sizeof(int32_t) /* sizeof(bias element) */,
1232 0 /* log2(sizeof(output element)) = log2(sizeof(int8_t)) */,
1233 &convolution_op->params.qs8_gemm,
1234 &convolution_op->params.qs8_gemm,
1235 pthreadpool_get_threads_count(threadpool));
1236 }
1237
1238 enum xnn_status xnn_setup_convolution2d_nhwc_f16(
1239 xnn_operator_t convolution_op,
1240 size_t batch_size,
1241 size_t input_height,
1242 size_t input_width,
1243 const void* input,
1244 void* output,
1245 pthreadpool_t threadpool)
1246 {
1247 if (convolution_op->type != xnn_operator_type_convolution_nhwc_f16) {
1248 xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
1249 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f16),
1250 xnn_operator_type_to_string(convolution_op->type));
1251 return xnn_status_invalid_parameter;
1252 }
1253
1254 return setup_convolution2d_nhwc(
1255 convolution_op,
1256 batch_size, input_height, input_width,
1257 input, output,
1258 XNN_INIT_FLAG_F16,
1259 1 /* log2(sizeof(input element)) = log2(sizeof(uint16_t)) */,
1260 1 /* log2(sizeof(filter element)) = log2(sizeof(uint16_t)) */,
1261 sizeof(uint16_t) /* sizeof(bias element) */,
1262 1 /* log2(sizeof(output element)) = log2(sizeof(uint16_t)) */,
1263 &convolution_op->params.f16_scaleminmax,
1264 &convolution_op->params.f16_minmax,
1265 pthreadpool_get_threads_count(threadpool));
1266 }
1267
1268 enum xnn_status xnn_setup_convolution2d_nhwc_f32(
1269 xnn_operator_t convolution_op,
1270 size_t batch_size,
1271 size_t input_height,
1272 size_t input_width,
1273 const float* input,
1274 float* output,
1275 pthreadpool_t threadpool)
1276 {
1277 if (convolution_op->type != xnn_operator_type_convolution_nhwc_f32) {
1278 xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
1279 xnn_operator_type_to_string(xnn_operator_type_convolution_nhwc_f32),
1280 xnn_operator_type_to_string(convolution_op->type));
1281 return xnn_status_invalid_parameter;
1282 }
1283
1284 return setup_convolution2d_nhwc(
1285 convolution_op,
1286 batch_size, input_height, input_width,
1287 input, output,
1288 XNN_INIT_FLAG_F32,
1289 2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
1290 2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
1291 sizeof(float) /* sizeof(bias element) */,
1292 2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
1293 &convolution_op->params.f32_minmax,
1294 &convolution_op->params.f32_minmax,
1295 pthreadpool_get_threads_count(threadpool));
1296 }
1297