• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <assert.h>
10 #include <math.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <string.h>
16 
17 #include <xnnpack.h>
18 #include <xnnpack/allocator.h>
19 #include <xnnpack/operator.h>
20 #include <xnnpack/common.h>
21 #include <xnnpack/log.h>
22 #include <xnnpack/math.h>
23 #include <xnnpack/params-init.h>
24 #include <xnnpack/params.h>
25 #include <xnnpack/indirection.h>
26 
27 
// Returns how many pooling-window positions exist along one axis when a window
// of `pooling_dimension` elements slides over `padded_input_dimension` elements
// in steps of `stride_dimension`.
//
// The arithmetic is unsigned: `stride_dimension` must be non-zero, and a
// `padded_input_dimension` smaller than `pooling_dimension` wraps around.
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t pooling_dimension,
    size_t stride_dimension)
{
  // Elements left over once the first window has been placed.
  const size_t remaining_span = padded_input_dimension - pooling_dimension;
  // Whole strides that still fit in the leftover span.
  const size_t additional_positions = remaining_span / stride_dimension;
  return additional_positions + 1;
}
35 
// Creates a 2D average pooling operator for NHWC tensors with quantized 8-bit
// (Q8) elements.
//
// Parameters:
//   input_padding_{top,right,bottom,left} - implicit padding around the input.
//   pooling_height, pooling_width - pooling window size; the window must hold
//       more than one and fewer than 2**24 elements.
//   stride_height, stride_width - window step; both must be non-zero.
//   channels - number of channels; must be non-zero.
//   input_pixel_stride, output_pixel_stride - distance between consecutive
//       pixels, in elements; each must be at least `channels`.
//   input_zero_point, input_scale - input quantization parameters; the scale
//       must be positive and normal.
//   output_zero_point, output_scale - output quantization parameters; the
//       scale must be positive and normal, and input_scale / output_scale must
//       lie in [2**-8, 2**8).
//   output_min, output_max - output clamping range; min must be below max.
//   flags - not read by this function.
//   average_pooling_op_out - receives the new operator; written on success
//       only.
//
// Returns xnn_status_success on success, or:
//   xnn_status_uninitialized         - XNNPACK is not initialized;
//   xnn_status_invalid_parameter     - an argument violates the rules above;
//   xnn_status_unsupported_parameter - scale ratio or window size out of range;
//   xnn_status_out_of_memory         - an allocation failed.
// On every failure path the partially constructed operator (possibly NULL) is
// handed to xnn_delete_operator.
enum xnn_status xnn_create_average_pooling2d_nhwc_q8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t pooling_height,
    uint32_t pooling_width,
    uint32_t stride_height,
    uint32_t stride_width,
    size_t channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* average_pooling_op_out)
{
  xnn_operator_t average_pooling_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Average Pooling operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  // Form the element count in 64 bits: a 32-bit product can wrap around
  // (e.g. 65536x65537 -> 65536) and would then slip past the size limit below.
  const uint64_t pooling_elements = (uint64_t) pooling_height * (uint64_t) pooling_width;
  if (pooling_elements == 0) {
    xnn_log_error(
      "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " pooling size: "
      "pooling size dimensions must be non-zero",
      pooling_width, pooling_height);
    goto error;
  }

  if (pooling_elements == 1) {
    xnn_log_error(
      "failed to create Average Pooling operator with 1 pooling element: 1x1 pooling is meaningless");
    goto error;
  }

  if (stride_height == 0 || stride_width == 0) {
    xnn_log_error(
      "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " stride: "
      "stride dimensions must be non-zero",
      stride_width, stride_height);
    goto error;
  }

  if (channels == 0) {
    xnn_log_error(
      "failed to create Average Pooling operator with %zu channels: number of channels must be non-zero",
      channels);
    goto error;
  }

  if (input_pixel_stride < channels) {
    xnn_log_error(
      "failed to create Average Pooling operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      input_pixel_stride, channels);
    goto error;
  }

  if (output_pixel_stride < channels) {
    xnn_log_error(
      "failed to create Average Pooling operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      output_pixel_stride, channels);
    goto error;
  }

  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create Average Pooling operator with %.7g input scale: "
      "scale must be finite, normalized, and positive",
      input_scale);
    goto error;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create Average Pooling operator with %.7g output scale: "
      "scale must be finite, normalized, and positive",
      output_scale);
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Average Pooling operator with [%" PRIu8 ", %" PRIu8 "] output range: "
      "range min must be below range max",
      output_min, output_max);
    goto error;
  }

  status = xnn_status_unsupported_parameter;

  const float input_output_scale = input_scale / output_scale;
  if (input_output_scale < 0x1.0p-8f || input_output_scale >= 0x1.0p+8f) {
    xnn_log_error(
      "failed to create Average Pooling operator with %.7g input scale and %.7g output scale: "
      "input-to-output scale ratio (%.7f) must be in [2**-8, 2**8) range",
      input_scale, output_scale, input_output_scale);
    goto error;
  }

  if (pooling_elements >= 16777216) {
    xnn_log_error(
      "failed to create Average Pooling operator with %" PRIu64 " (%" PRIu32 "x%" PRIu32 ") pooling elements: "
      "the number of elements in the pooling area must be below 2**24",
      pooling_elements, pooling_width, pooling_height);
    goto error;
  }

  // The count is now known to be below 2**24, so narrowing is lossless.
  const uint32_t pooling_size = (uint32_t) pooling_elements;

  status = xnn_status_out_of_memory;

  average_pooling_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (average_pooling_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Average Pooling operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  const uint32_t mr = xnn_params.q8.avgpool.mr;
  const uint32_t qr = xnn_params.q8.avgpool.qr;
  // A one-pixel buffer filled with the input zero point is allocated when there
  // is padding, or when the window size is not exactly mr + k * qr elements.
  if (any_padding || pooling_size < mr || (pooling_size - mr) % qr != 0) {
    void* zero_buffer = xnn_allocate_simd_memory(channels * sizeof(uint8_t) + XNN_EXTRA_BYTES);
    if (zero_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for Average Pooling zero padding",
        channels * sizeof(uint8_t) + XNN_EXTRA_BYTES);
      goto error;
    }
    memset(zero_buffer, input_zero_point, channels * sizeof(uint8_t));
    average_pooling_op->zero_buffer = zero_buffer;
  }

  average_pooling_op->padding_top = input_padding_top;
  average_pooling_op->padding_right = input_padding_right;
  average_pooling_op->padding_bottom = input_padding_bottom;
  average_pooling_op->padding_left = input_padding_left;

  average_pooling_op->kernel_height = pooling_height;
  average_pooling_op->kernel_width = pooling_width;
  average_pooling_op->stride_height = stride_height;
  average_pooling_op->stride_width = stride_width;
  average_pooling_op->dilation_height = 1;
  average_pooling_op->dilation_width = 1;
  average_pooling_op->channels = channels;
  average_pooling_op->input_pixel_stride = input_pixel_stride;
  average_pooling_op->output_pixel_stride = output_pixel_stride;

  // Number of rows read in the micro-kernel.
  const size_t nrows = round_up(doz(pooling_size, mr), qr) + mr;
  // The bias is the negated input zero point times the number of rows read.
  average_pooling_op->q8_avgpool_params =
    xnn_init_q8_avgpool_params(
      (int32_t) -((uint32_t) input_zero_point * (uint32_t) nrows),
      input_scale / (output_scale * (float) pooling_size),
      output_zero_point, output_min, output_max);

  average_pooling_op->type = xnn_operator_type_average_pooling_nhwc_q8;
  average_pooling_op->ukernel.type = xnn_ukernel_type_average_pooling;

  *average_pooling_op_out = average_pooling_op;
  return xnn_status_success;

error:
  xnn_delete_operator(average_pooling_op);
  return status;
}
211 
// Creates a 2D average pooling operator for NHWC tensors with 32-bit
// floating-point elements.
//
// Parameters:
//   input_padding_{top,right,bottom,left} - implicit padding around the input.
//   pooling_height, pooling_width - pooling window size; the window must hold
//       more than one element and the element count must fit into 32 bits.
//   stride_height, stride_width - window step; both must be non-zero.
//   channels - number of channels; must be non-zero.
//   input_pixel_stride, output_pixel_stride - distance between consecutive
//       pixels, in elements; each must be at least `channels`.
//   output_min, output_max - output clamping range; neither may be NaN and
//       min must be below max.
//   flags - not read by this function.
//   average_pooling_op_out - receives the new operator; written on success
//       only.
//
// With any non-zero padding the operator is configured for the pixelwise
// average pooling micro-kernel type; otherwise it uses the plain average
// pooling type with a fixed 1 / (pooling_height * pooling_width) multiplier.
//
// Returns xnn_status_success on success, or:
//   xnn_status_uninitialized         - XNNPACK is not initialized;
//   xnn_status_invalid_parameter     - an argument violates the rules above;
//   xnn_status_unsupported_parameter - the window element count exceeds 32 bits;
//   xnn_status_out_of_memory         - an allocation failed.
// On every failure path the partially constructed operator (possibly NULL) is
// handed to xnn_delete_operator.
enum xnn_status xnn_create_average_pooling2d_nhwc_f32(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t pooling_height,
    uint32_t pooling_width,
    uint32_t stride_height,
    uint32_t stride_width,
    size_t channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* average_pooling_op_out)
{
  xnn_operator_t average_pooling_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Average Pooling operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  // Form the element count in 64 bits: a 32-bit product can wrap around and
  // would then disagree with the kernel dimensions stored in the operator.
  const uint64_t pooling_elements = (uint64_t) pooling_height * (uint64_t) pooling_width;
  if (pooling_elements == 0) {
    xnn_log_error(
      "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " pooling size: "
      "pooling size dimensions must be non-zero",
      pooling_width, pooling_height);
    goto error;
  }

  if (pooling_elements == 1) {
    xnn_log_error(
      "failed to create Average Pooling operator with 1 pooling element: 1x1 pooling is meaningless");
    goto error;
  }

  if (stride_height == 0 || stride_width == 0) {
    xnn_log_error(
      "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " stride: "
      "stride dimensions must be non-zero",
      stride_width, stride_height);
    goto error;
  }

  if (channels == 0) {
    xnn_log_error(
      "failed to create Average Pooling operator with %zu channels: number of channels must be non-zero",
      channels);
    goto error;
  }

  if (input_pixel_stride < channels) {
    xnn_log_error(
      "failed to create Average Pooling operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      input_pixel_stride, channels);
    goto error;
  }

  if (output_pixel_stride < channels) {
    xnn_log_error(
      "failed to create Average Pooling operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      output_pixel_stride, channels);
    goto error;
  }

  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create Average Pooling operator with NaN output lower bound: lower bound must be non-NaN");
    goto error;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create Average Pooling operator with NaN output upper bound: upper bound must be non-NaN");
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Average Pooling operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
      output_min, output_max);
    goto error;
  }

  status = xnn_status_unsupported_parameter;

  if (pooling_elements > UINT32_MAX) {
    xnn_log_error(
      "failed to create Average Pooling operator with %" PRIu64 " (%" PRIu32 "x%" PRIu32 ") pooling elements: "
      "the number of elements in the pooling area must fit into 32 bits",
      pooling_elements, pooling_width, pooling_height);
    goto error;
  }

  // The count is now known to fit, so narrowing is lossless.
  const uint32_t pooling_size = (uint32_t) pooling_elements;

  status = xnn_status_out_of_memory;

  average_pooling_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (average_pooling_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Average Pooling operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  const uint32_t mr = xnn_params.f32.avgpool.mr;
  const uint32_t qr = xnn_params.f32.avgpool.qr;
  // A zero-filled one-pixel buffer is allocated when there is padding, or when
  // the window size is not exactly mr + k * qr elements.
  if (any_padding || pooling_size < mr || (pooling_size - mr) % qr != 0) {
    void* zero_buffer = xnn_allocate_zero_simd_memory(channels * sizeof(float) + XNN_EXTRA_BYTES);
    if (zero_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for Average Pooling zero padding",
        channels * sizeof(float) + XNN_EXTRA_BYTES);
      goto error;
    }
    average_pooling_op->zero_buffer = zero_buffer;
  }

  average_pooling_op->padding_top = input_padding_top;
  average_pooling_op->padding_right = input_padding_right;
  average_pooling_op->padding_bottom = input_padding_bottom;
  average_pooling_op->padding_left = input_padding_left;

  average_pooling_op->kernel_height = pooling_height;
  average_pooling_op->kernel_width = pooling_width;
  average_pooling_op->stride_height = stride_height;
  average_pooling_op->stride_width = stride_width;
  average_pooling_op->dilation_height = 1;
  average_pooling_op->dilation_width = 1;
  average_pooling_op->channels = channels;
  average_pooling_op->input_pixel_stride = input_pixel_stride;
  average_pooling_op->output_pixel_stride = output_pixel_stride;

  average_pooling_op->type = xnn_operator_type_average_pooling_nhwc_f32;
  if (any_padding) {
    // With padding, setup computes a per-output-pixel multiplier, so only the
    // clamping bounds are stored here.
    average_pooling_op->f32_output_params =
      xnn_init_f32_output_params(output_min, output_max);

    average_pooling_op->ukernel.type = xnn_ukernel_type_pixelwise_average_pooling;
  } else {
    // Without padding every output pixel uses the same 1/pooling_size multiplier.
    average_pooling_op->f32_avgpool_params =
      xnn_init_f32_avgpool_params(1.0f / (float) pooling_size, output_min, output_max);

    average_pooling_op->ukernel.type = xnn_ukernel_type_average_pooling;
  }

  *average_pooling_op_out = average_pooling_op;
  return xnn_status_success;

error:
  xnn_delete_operator(average_pooling_op);
  return status;
}
360 
// Binds a Q8 average pooling operator to a concrete input/output and prepares
// it for execution.
//
// Parameters:
//   average_pooling_op - operator of type
//       xnn_operator_type_average_pooling_nhwc_q8.
//   batch_size - number of images; 0 marks the operator as "skip".
//   input_height, input_width - input size in pixels; both must be non-zero
//       and, together with the operator padding, at least as large as the
//       pooling window.
//   input, output - tensor data pointers stored in the operator.
//   threadpool - not read by this function.
//
// Returns xnn_status_success on success, or:
//   xnn_status_invalid_parameter - wrong operator type or invalid input size;
//   xnn_status_uninitialized     - XNNPACK is not initialized;
//   xnn_status_out_of_memory     - the indirection buffer could not be
//                                  (re)allocated.
//
// When the input pointer and input size match the previous successful setup
// and batch_size does not exceed the batch size already covered, the existing
// indirection buffer is reused and only the batch range and output pointer are
// refreshed.
enum xnn_status xnn_setup_average_pooling2d_nhwc_q8(
    xnn_operator_t average_pooling_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (average_pooling_op->type != xnn_operator_type_average_pooling_nhwc_q8) {
    xnn_log_error("failed to setup Average Pooling (Q8) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }
  average_pooling_op->state = xnn_run_state_invalid;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to setup Average Pooling operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup Average Pooling operator with %zux%zu input: input dimensions must be non-zero",
      input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  const size_t pooling_height = average_pooling_op->kernel_height;
  const size_t pooling_width = average_pooling_op->kernel_width;
  const size_t padded_input_height =
    average_pooling_op->padding_top + input_height + average_pooling_op->padding_bottom;
  const size_t padded_input_width =
    average_pooling_op->padding_left + input_width + average_pooling_op->padding_right;
  // The output size formula subtracts the window from the padded input in
  // unsigned arithmetic; reject inputs for which that would wrap around.
  if (padded_input_height < pooling_height || padded_input_width < pooling_width) {
    xnn_log_error(
      "failed to setup Average Pooling operator with %zux%zu padded input and %zux%zu pooling size: "
      "padded input must be at least as large as the pooling window",
      padded_input_width, padded_input_height, pooling_width, pooling_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    average_pooling_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  average_pooling_op->batch_size = batch_size;
  average_pooling_op->input_height = input_height;
  average_pooling_op->input_width = input_width;
  average_pooling_op->input = input;

  average_pooling_op->output_height = compute_output_dimension(
      padded_input_height,
      average_pooling_op->kernel_height,
      average_pooling_op->stride_height);
  average_pooling_op->output_width = compute_output_dimension(
      padded_input_width,
      average_pooling_op->kernel_width,
      average_pooling_op->stride_width);
  average_pooling_op->output = output;

  // Fast path: same input pointer and size as the last setup, and the batch is
  // already covered by the existing indirection buffer.
  size_t valid_batch_size = 0;
  if (input == average_pooling_op->last_input &&
      input_height == average_pooling_op->last_input_height &&
      input_width == average_pooling_op->last_input_width)
  {
    valid_batch_size = average_pooling_op->valid_batch_size;
    if (batch_size <= valid_batch_size) {
      average_pooling_op->compute.range[0] = batch_size;
      average_pooling_op->context.average_pooling.output = output;
      average_pooling_op->state = xnn_run_state_ready;
      return xnn_status_success;
    }
  }

  const size_t pooling_size = pooling_height * pooling_width;
  const size_t output_height = average_pooling_op->output_height;
  const size_t output_width = average_pooling_op->output_width;
  // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
  const uint32_t mr = xnn_params.q8.avgpool.mr;

  const size_t step_width = min(average_pooling_op->stride_width, pooling_width);
  const size_t step_height = pooling_size + (output_width - 1) * step_width * pooling_height;
  const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);

  // Keep the operator's old pointer untouched until reallocation succeeds.
  const void** indirection_buffer = (const void**) xnn_reallocate_memory(average_pooling_op->indirection_buffer, indirection_buffer_size);
  if (indirection_buffer == NULL) {
    xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
    return xnn_status_out_of_memory;
  }
  average_pooling_op->indirection_buffer = indirection_buffer;

  xnn_indirection_init_dwconv2d(
    average_pooling_op, valid_batch_size, step_height, step_width, 0 /* log2(sizeof(uint8_t)) */);

  const uint32_t qr = xnn_params.q8.avgpool.qr;
  const size_t channels = average_pooling_op->channels;

  const size_t indirect_input_height_stride = step_height * sizeof(void*);
  const size_t output_width_stride = average_pooling_op->output_pixel_stride * sizeof(uint8_t);
  const size_t output_height_stride = output_width * output_width_stride;

  const size_t multipass_adjustment =
    pooling_size > mr ? round_up(pooling_size - mr, qr) + mr - qr : 0;
  average_pooling_op->context.average_pooling = (struct average_pooling_context) {
      .indirect_input = indirection_buffer,
      .indirect_input_batch_stride = output_height * indirect_input_height_stride,
      .indirect_input_height_stride = indirect_input_height_stride,
      .output = output,
      .output_batch_stride = output_height * output_height_stride,
      .output_height_stride = output_height_stride,
      .output_width = output_width,
      .pooling_size = pooling_size,
      .channels = channels,
      .zero = average_pooling_op->zero_buffer,
      .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
      .output_increment = output_width_stride - channels * sizeof(uint8_t),
      .params.q8 = average_pooling_op->q8_avgpool_params,
  };
  average_pooling_op->compute.type = xnn_parallelization_type_2d;
  average_pooling_op->compute.range[0] = batch_size;
  average_pooling_op->compute.range[1] = output_height;

  // Windows of at most mr elements use the unipass micro-kernel; larger ones
  // use the multipass micro-kernel.
  if (pooling_size <= mr) {
    average_pooling_op->context.average_pooling.unipass_ukernel = xnn_params.q8.avgpool.up;
    average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_unipass;
  } else {
    average_pooling_op->context.average_pooling.multipass_ukernel = xnn_params.q8.avgpool.mp;
    average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_multipass;
  }
  average_pooling_op->state = xnn_run_state_ready;

  // Remember what the indirection buffer was built for, to enable the fast path.
  average_pooling_op->last_input = input;
  average_pooling_op->last_input_height = input_height;
  average_pooling_op->last_input_width = input_width;
  average_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);

  return xnn_status_success;
}
488 
// Binds an F32 average pooling operator to a concrete input/output and
// prepares it for execution.
//
// Parameters:
//   average_pooling_op - operator of type
//       xnn_operator_type_average_pooling_nhwc_f32.
//   batch_size - number of images; 0 marks the operator as "skip".
//   input_height, input_width - input size in pixels; both must be non-zero
//       and, together with the operator padding, at least as large as the
//       pooling window.
//   input, output - tensor data pointers stored in the operator.
//   threadpool - not read by this function.
//
// Returns xnn_status_success on success, or:
//   xnn_status_invalid_parameter - wrong operator type or invalid input size;
//   xnn_status_uninitialized     - XNNPACK is not initialized;
//   xnn_status_out_of_memory     - the indirection or pixelwise buffer could
//                                  not be (re)allocated.
//
// When the input pointer and input size match the previous successful setup
// and batch_size does not exceed the batch size already covered, the existing
// buffers are reused and only the batch range and output pointer are refreshed.
enum xnn_status xnn_setup_average_pooling2d_nhwc_f32(
    xnn_operator_t average_pooling_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (average_pooling_op->type != xnn_operator_type_average_pooling_nhwc_f32) {
    xnn_log_error("failed to setup Average Pooling (F32) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }
  average_pooling_op->state = xnn_run_state_invalid;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to setup Average Pooling operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup Average Pooling operator with %zux%zu input: input dimensions must be non-zero",
      input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  const size_t pooling_height = average_pooling_op->kernel_height;
  const size_t pooling_width = average_pooling_op->kernel_width;
  const size_t padded_input_height =
    average_pooling_op->padding_top + input_height + average_pooling_op->padding_bottom;
  const size_t padded_input_width =
    average_pooling_op->padding_left + input_width + average_pooling_op->padding_right;
  // The output size formula subtracts the window from the padded input in
  // unsigned arithmetic; reject inputs for which that would wrap around.
  if (padded_input_height < pooling_height || padded_input_width < pooling_width) {
    xnn_log_error(
      "failed to setup Average Pooling operator with %zux%zu padded input and %zux%zu pooling size: "
      "padded input must be at least as large as the pooling window",
      padded_input_width, padded_input_height, pooling_width, pooling_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    average_pooling_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  average_pooling_op->batch_size = batch_size;
  average_pooling_op->input_height = input_height;
  average_pooling_op->input_width = input_width;
  average_pooling_op->input = input;

  average_pooling_op->output_height = compute_output_dimension(
      padded_input_height,
      average_pooling_op->kernel_height,
      average_pooling_op->stride_height);
  average_pooling_op->output_width = compute_output_dimension(
      padded_input_width,
      average_pooling_op->kernel_width,
      average_pooling_op->stride_width);
  average_pooling_op->output = output;

  // Fast path: same input pointer and size as the last setup, and the batch is
  // already covered by the existing indirection buffer.
  size_t valid_batch_size = 0;
  if (input == average_pooling_op->last_input &&
      input_height == average_pooling_op->last_input_height &&
      input_width == average_pooling_op->last_input_width)
  {
    valid_batch_size = average_pooling_op->valid_batch_size;
    if (batch_size <= valid_batch_size) {
      average_pooling_op->compute.range[0] = batch_size;
      // Update the output pointer through the context member that matches the
      // configured micro-kernel type, so this does not rely on both context
      // structs placing `output` at the same offset.
      // NOTE(review): the struct layouts are not visible in this file - confirm.
      if (average_pooling_op->ukernel.type == xnn_ukernel_type_pixelwise_average_pooling) {
        average_pooling_op->context.pixelwise_average_pooling.output = output;
      } else {
        average_pooling_op->context.average_pooling.output = output;
      }
      average_pooling_op->state = xnn_run_state_ready;
      return xnn_status_success;
    }
  }

  const size_t pooling_size = pooling_height * pooling_width;
  const size_t output_height = average_pooling_op->output_height;
  const size_t output_width = average_pooling_op->output_width;
  // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
  const uint32_t mr = xnn_params.f32.avgpool.mr;
  assert(mr == xnn_params.f32.pavgpool.mr);

  const size_t step_width = min(average_pooling_op->stride_width, pooling_width);
  const size_t step_height = pooling_size + (output_width - 1) * step_width * pooling_height;
  const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);

  // Keep the operator's old pointer untouched until reallocation succeeds.
  const void** indirection_buffer = (const void**) xnn_reallocate_memory(average_pooling_op->indirection_buffer, indirection_buffer_size);
  if (indirection_buffer == NULL) {
    xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
    return xnn_status_out_of_memory;
  }
  average_pooling_op->indirection_buffer = indirection_buffer;

  xnn_indirection_init_dwconv2d(
    average_pooling_op, valid_batch_size, step_height, step_width, 2 /* log2(sizeof(float)) */);

  const size_t channels = average_pooling_op->channels;

  const size_t indirect_input_height_stride = step_height * sizeof(void*);
  const size_t output_width_stride = average_pooling_op->output_pixel_stride * sizeof(float);
  const size_t output_height_stride = output_width * output_width_stride;

  switch (average_pooling_op->ukernel.type) {
    case xnn_ukernel_type_average_pooling:
    {
      const uint32_t qr = xnn_params.f32.avgpool.qr;
      const size_t multipass_adjustment =
        pooling_size > mr ? round_up(pooling_size - mr, qr) + mr - qr : 0;
      average_pooling_op->context.average_pooling = (struct average_pooling_context) {
        .indirect_input = indirection_buffer,
        .indirect_input_batch_stride = output_height * indirect_input_height_stride,
        .indirect_input_height_stride = indirect_input_height_stride,
        .output = output,
        .output_batch_stride = output_height * output_height_stride,
        .output_height_stride = output_height_stride,
        .output_width = output_width,
        .pooling_size = pooling_size,
        .channels = channels,
        .zero = average_pooling_op->zero_buffer,
        .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
        .output_increment = output_width_stride - channels * sizeof(float),
        .params.f32 = average_pooling_op->f32_avgpool_params,
      };
      // Windows of at most mr elements use the unipass micro-kernel; larger
      // ones use the multipass micro-kernel.
      if (pooling_size <= mr) {
        average_pooling_op->context.average_pooling.unipass_ukernel = xnn_params.f32.avgpool.up;
        average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_unipass;
      } else {
        average_pooling_op->context.average_pooling.multipass_ukernel = xnn_params.f32.avgpool.mp;
        average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_multipass;
      }
      break;
    }
    case xnn_ukernel_type_pixelwise_average_pooling:
    {
      // The per-pixel multipliers depend only on the input size and the
      // operator geometry, so they are rebuilt only when the input size changes.
      if (input_height != average_pooling_op->last_input_height ||
          input_width != average_pooling_op->last_input_width)
      {
        const size_t pixelwise_buffer_size = output_height * output_width * sizeof(float);
        float* pixelwise_buffer = (float*) xnn_reallocate_memory(average_pooling_op->pixelwise_buffer, pixelwise_buffer_size);
        if (pixelwise_buffer == NULL) {
          xnn_log_error("failed to allocate %zu bytes for pixelwise buffer", pixelwise_buffer_size);
          return xnn_status_out_of_memory;
        }
        average_pooling_op->pixelwise_buffer = pixelwise_buffer;

        // For each output pixel, store the reciprocal of the product of the
        // row and column ranges computed below.
        // NOTE(review): doz() comes from xnnpack/math.h; presumably a
        // subtraction that saturates at zero, which would clip the window to
        // the unpadded input - confirm.
        float* pixelwise_pointer = pixelwise_buffer;
        for (size_t output_y = 0; output_y < output_height; output_y++) {
          const size_t input_y_start = doz(output_y * average_pooling_op->stride_height, average_pooling_op->padding_top);
          const size_t input_y_end =
            min(doz(output_y * average_pooling_op->stride_height + average_pooling_op->kernel_height, average_pooling_op->padding_top), input_height);
          const uint32_t input_y_range = (uint32_t) (input_y_end - input_y_start);
          for (size_t output_x = 0; output_x < output_width; output_x++) {
            const size_t input_x_start = doz(output_x * average_pooling_op->stride_width, average_pooling_op->padding_left);
            const size_t input_x_end =
              min(doz(output_x * average_pooling_op->stride_width + average_pooling_op->kernel_width, average_pooling_op->padding_left), input_width);
            const uint32_t input_x_range = (uint32_t) (input_x_end - input_x_start);
            *pixelwise_pointer++ = 1.0f / ((float) (int32_t) (input_y_range * input_x_range));
          }
        }
      }

      const uint32_t qr = xnn_params.f32.pavgpool.qr;
      const size_t multipass_adjustment =
        pooling_size > mr ? round_up(pooling_size - mr, qr) + mr - qr : 0;
      average_pooling_op->context.pixelwise_average_pooling = (struct pixelwise_average_pooling_context) {
        .indirect_input = indirection_buffer,
        .indirect_input_batch_stride = output_height * indirect_input_height_stride,
        .indirect_input_height_stride = indirect_input_height_stride,
        .pixelwise_buffer = average_pooling_op->pixelwise_buffer,
        .pixelwise_buffer_height_stride = output_width * sizeof(float),
        .output = output,
        .output_batch_stride = output_height * output_height_stride,
        .output_height_stride = output_height_stride,
        .output_width = output_width,
        .pooling_size = pooling_size,
        .channels = channels,
        .zero = average_pooling_op->zero_buffer,
        .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
        .output_increment = output_width_stride - channels * sizeof(float),
        .params.f32 = average_pooling_op->f32_output_params,
      };
      if (pooling_size <= mr) {
        average_pooling_op->context.pixelwise_average_pooling.unipass_ukernel = xnn_params.f32.pavgpool.up;
        average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_pixelwise_average_pooling_unipass;
      } else {
        average_pooling_op->context.pixelwise_average_pooling.multipass_ukernel = xnn_params.f32.pavgpool.mp;
        average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_pixelwise_average_pooling_multipass;
      }
      break;
    }
    default:
      XNN_UNREACHABLE;
  }
  average_pooling_op->compute.type = xnn_parallelization_type_2d;
  average_pooling_op->compute.range[0] = batch_size;
  average_pooling_op->compute.range[1] = output_height;
  average_pooling_op->state = xnn_run_state_ready;

  // Remember what the buffers were built for, to enable the fast path.
  average_pooling_op->last_input = input;
  average_pooling_op->last_input_height = input_height;
  average_pooling_op->last_input_width = input_width;
  average_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);

  return xnn_status_success;
}
682