1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <assert.h>
10 #include <math.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <string.h>
16
17 #include <xnnpack.h>
18 #include <xnnpack/allocator.h>
19 #include <xnnpack/common.h>
20 #include <xnnpack/indirection.h>
21 #include <xnnpack/log.h>
22 #include <xnnpack/math.h>
23 #include <xnnpack/operator.h>
24 #include <xnnpack/params-init.h>
25 #include <xnnpack/params.h>
26
27
// Number of output pixels along one spatial dimension when a (possibly
// dilated) pooling window of size kernel_dimension slides with the given
// stride over an input that has already been padded.
static inline size_t compute_output_dimension(
  size_t padded_input_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t stride_dimension)
{
  // A dilated kernel of K taps spans (K - 1) * dilation + 1 input pixels.
  const size_t dilated_kernel_dimension = dilation_dimension * (kernel_dimension - 1) + 1;
  const size_t sliding_range = padded_input_dimension - dilated_kernel_dimension;
  return sliding_range / stride_dimension + 1;
}
37
// Output size under TensorFlow SAME padding: padding is chosen so the window
// covers the whole input, making the output size ceil(input / stride),
// independent of the kernel size.
static inline size_t compute_output_dimension_with_tf_same_padding(
  size_t input_dimension,
  size_t stride_dimension)
{
  const size_t output_dimension = divide_round_up(input_dimension, stride_dimension);
  return output_dimension;
}
44
// Shared implementation behind xnn_create_max_pooling2d_nhwc_{u8,f32}.
//
// Validates pooling geometry (window, stride, dilation), channel count, and
// pixel strides, then allocates a zero-initialized operator object and
// records all parameters on it. The datatype-specific clamping parameters
// (params, params_size) are copied verbatim into the operator. Per-input
// state (output dimensions, indirection buffer) is NOT computed here — that
// is deferred to setup, when the input size is known.
//
// datatype_init_flags: init flags that must all be set in
// xnn_params.init_flags for this datatype to be usable.
//
// On success writes *max_pooling_op_out and returns xnn_status_success; on
// failure logs a diagnostic, frees any partially-built operator, and returns
// the matching error status.
static enum xnn_status create_max_pooling2d_nhwc(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint32_t flags,
  const void* params,
  size_t params_size,
  uint32_t datatype_init_flags,
  enum xnn_operator_type operator_type,
  xnn_operator_t* max_pooling_op_out)
{
  xnn_operator_t max_pooling_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  // XNNPACK must have been initialized before any operator is created.
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(operator_type));
    return xnn_status_uninitialized;
  }

  status = xnn_status_unsupported_hardware;

  // Every flag required by this datatype must be present (micro-kernels for
  // the datatype were successfully initialized).
  if ((xnn_params.init_flags & datatype_init_flags) != datatype_init_flags) {
    xnn_log_error(
      "failed to create %s operator: operations on data type are not supported",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  status = xnn_status_invalid_parameter;

  const uint32_t pooling_size = pooling_height * pooling_width;
  if (pooling_size == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " pooling size: "
      "pooling size dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type),
      pooling_width, pooling_height);
    goto error;
  }

  // A 1x1 max-pooling window is an identity operation — reject it.
  if (pooling_size == 1) {
    xnn_log_error(
      "failed to create %s operator with 1 pooling element: 1x1 pooling is meaningless",
      xnn_operator_type_to_string(operator_type));
    goto error;
  }

  if (stride_height == 0 || stride_width == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), stride_width, stride_height);
    goto error;
  }

  if (dilation_height == 0 || dilation_width == 0) {
    xnn_log_error(
      "failed to create %s operator with %" PRIu32 "x%" PRIu32 " dilation: dilation dimensions must be non-zero",
      xnn_operator_type_to_string(operator_type), dilation_width, dilation_height);
    goto error;
  }

  if (channels == 0) {
    xnn_log_error(
      "failed to create %s operator with %zu channels: number of channels must be non-zero",
      xnn_operator_type_to_string(operator_type), channels);
    goto error;
  }

  // Pixel strides are in elements and must cover at least one full pixel.
  if (input_pixel_stride < channels) {
    xnn_log_error(
      "failed to create %s operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      xnn_operator_type_to_string(operator_type), input_pixel_stride, channels);
    goto error;
  }

  if (output_pixel_stride < channels) {
    xnn_log_error(
      "failed to create %s operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      xnn_operator_type_to_string(operator_type), output_pixel_stride, channels);
    goto error;
  }

  // TF SAME padding is computed automatically at setup time, so an explicit
  // padding specification alongside it is contradictory.
  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    if (any_padding) {
      xnn_log_error(
        "failed to create %s operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32" padding: "
        "TensorFlow SAME padding can't be combined with explicit padding specification",
        xnn_operator_type_to_string(operator_type),
        input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
      goto error;
    }
  }

  status = xnn_status_out_of_memory;

  // Zeroed, SIMD-aligned allocation: all not-yet-set fields start at 0/NULL.
  max_pooling_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (max_pooling_op == NULL) {
    xnn_log_error(
      "failed to allocate %zu bytes for %s operator descriptor",
      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
    goto error;
  }

  // Record the configuration; with TF SAME padding these padding fields are
  // overwritten at setup time once the input size is known.
  max_pooling_op->padding_top = input_padding_top;
  max_pooling_op->padding_right = input_padding_right;
  max_pooling_op->padding_bottom = input_padding_bottom;
  max_pooling_op->padding_left = input_padding_left;

  max_pooling_op->kernel_height = pooling_height;
  max_pooling_op->kernel_width = pooling_width;
  max_pooling_op->stride_height = stride_height;
  max_pooling_op->stride_width = stride_width;
  max_pooling_op->dilation_height = dilation_height;
  max_pooling_op->dilation_width = dilation_width;
  max_pooling_op->channels = channels;
  max_pooling_op->input_pixel_stride = input_pixel_stride;
  max_pooling_op->output_pixel_stride = output_pixel_stride;

  // Datatype-specific min/max parameters, opaque at this level.
  memcpy(&max_pooling_op->params, params, params_size);
  max_pooling_op->type = operator_type;
  max_pooling_op->flags = flags;

  // The operator cannot run until a successful setup call.
  max_pooling_op->state = xnn_run_state_invalid;

  *max_pooling_op_out = max_pooling_op;
  return xnn_status_success;

error:
  // Safe with NULL: nothing was allocated yet on early validation failures.
  xnn_delete_operator(max_pooling_op);
  return status;
}
190
// Shared implementation behind xnn_setup_max_pooling2d_nhwc_{u8,f32}.
//
// Computes output dimensions (and, under TF SAME padding, the actual padding
// split) for the given input size, (re)builds the input indirection buffer if
// the input dimensions changed since the last setup, and fills in the compute
// context so the operator is ready to run over a 2D (batch x output row)
// parallelization grid.
//
// log2_input_element_size / log2_output_element_size: log2 of the element
// byte size (0 for uint8_t, 2 for float), used to scale pixel strides.
// maxpool: micro-kernel descriptor; mr is the window-element count handled by
// the kernel's first pass and qr the count per additional pass.
// params: datatype-specific clamping parameters copied into the context.
// num_threads is accepted for interface symmetry; it is not read here.
static enum xnn_status setup_max_pooling2d_nhwc(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_output_element_size,
  struct maxpool_parameters maxpool[restrict XNN_MIN_ELEMENTS(1)],
  const void* params,
  size_t params_size,
  size_t num_threads)
{
  // Invalidate first so a failed setup leaves the operator non-runnable.
  max_pooling_op->state = xnn_run_state_invalid;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error(
      "failed to setup %s operator: XNNPACK is not initialized",
      xnn_operator_type_to_string(max_pooling_op->type));
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
      xnn_operator_type_to_string(max_pooling_op->type), input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  // Empty batch: nothing to compute, but the setup itself succeeds.
  if (batch_size == 0) {
    max_pooling_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  max_pooling_op->input_height = input_height;
  max_pooling_op->input_width = input_width;
  max_pooling_op->input = input;

  if (max_pooling_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    // TF SAME: output size depends only on input size and stride; derive the
    // padding from it and split the excess, favoring bottom/right (TF rule:
    // extra odd padding pixel goes after the input).
    max_pooling_op->output_height = compute_output_dimension_with_tf_same_padding(
      input_height, max_pooling_op->stride_height);
    max_pooling_op->output_width = compute_output_dimension_with_tf_same_padding(
      input_width, max_pooling_op->stride_width);

    const uint32_t effective_kernel_height = (max_pooling_op->kernel_height - 1) * max_pooling_op->dilation_height + 1;
    const uint32_t effective_kernel_width = (max_pooling_op->kernel_width - 1) * max_pooling_op->dilation_width + 1;
    // doz = difference-or-zero: clamps to 0 when the window never overhangs.
    const uint32_t total_padding_height =
      doz((max_pooling_op->output_height - 1) * max_pooling_op->stride_height + effective_kernel_height, input_height);
    const uint32_t total_padding_width =
      doz((max_pooling_op->output_width - 1) * max_pooling_op->stride_width + effective_kernel_width, input_width);
    max_pooling_op->padding_top = total_padding_height / 2;
    max_pooling_op->padding_left = total_padding_width / 2;
    max_pooling_op->padding_bottom = total_padding_height - max_pooling_op->padding_top;
    max_pooling_op->padding_right = total_padding_width - max_pooling_op->padding_left;
  } else {
    // Explicit padding: standard convolution-style output size formula.
    max_pooling_op->output_height = compute_output_dimension(
      max_pooling_op->padding_top + input_height + max_pooling_op->padding_bottom,
      max_pooling_op->kernel_height,
      max_pooling_op->dilation_height,
      max_pooling_op->stride_height);
    max_pooling_op->output_width = compute_output_dimension(
      max_pooling_op->padding_left + input_width + max_pooling_op->padding_right,
      max_pooling_op->kernel_width,
      max_pooling_op->dilation_width,
      max_pooling_op->stride_width);
  }

  const size_t pooling_height = max_pooling_op->kernel_height;
  const size_t pooling_width = max_pooling_op->kernel_width;
  const size_t pooling_size = pooling_height * pooling_width;
  const size_t output_height = max_pooling_op->output_height;
  const size_t output_width = max_pooling_op->output_width;
  const uint32_t mr = maxpool->mr;

  // Indirection buffer geometry: step_width is the number of new pointer
  // columns contributed per output pixel (adjacent windows share columns when
  // stride < width and dilation == 1); step_height is the number of pointer
  // entries per output row.
  const size_t step_width =
    max_pooling_op->dilation_width > 1 ? pooling_width : min(max_pooling_op->stride_width, pooling_width);
  const size_t step_height = pooling_size + (output_width - 1) * step_width * pooling_height;

  // Rebuild the indirection buffer only when the input geometry changed;
  // pointer-identical re-setups with the same size reuse it via input_offset.
  if (input_height != max_pooling_op->last_input_height ||
      input_width != max_pooling_op->last_input_width)
  {
    // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
    const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + output_height * step_height);
    const void** indirection_buffer =
      (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    max_pooling_op->indirection_buffer = indirection_buffer;

    // Fill the buffer with pointers into `input` for every window element.
    xnn_indirection_init_maxpool2d(max_pooling_op, step_height, step_width, log2_input_element_size);

    max_pooling_op->last_input = input;
    max_pooling_op->last_input_height = input_height;
    max_pooling_op->last_input_width = input_width;
  }

  const uint32_t qr = maxpool->qr;
  const size_t channels = max_pooling_op->channels;

  const size_t indirect_input_height_stride = step_height * sizeof(void*);
  const size_t output_width_stride = max_pooling_op->output_pixel_stride << log2_output_element_size;
  const size_t output_height_stride = output_width * output_width_stride;
  // NOTE(review): multipass_adjustment appears to account for window elements
  // consumed by the mr-wide first pass plus qr-wide follow-up passes when
  // pooling_size > mr — confirm against the maxpool micro-kernel contract.
  const size_t multipass_adjustment = round_up(doz(pooling_size, mr), qr) + mr;

  max_pooling_op->context.max_pooling = (struct max_pooling_context) {
    .indirect_input = max_pooling_op->indirection_buffer,
    // Indirection pointers are built against last_input; this offset corrects
    // them when only the input pointer (not the size) changed.
    .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) max_pooling_op->last_input),
    .indirect_input_height_stride = indirect_input_height_stride,
    .input_batch_stride = (input_height * input_width * max_pooling_op->input_pixel_stride) << log2_input_element_size,
    .output = output,
    .output_batch_stride = output_height * output_height_stride,
    .output_height_stride = output_height_stride,
    .output_width = output_width,
    .pooling_size = pooling_size,
    .channels = channels,
    .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
    .output_increment = output_width_stride - (channels << log2_output_element_size),
    .ukernel = maxpool->ukernel,
  };
  memcpy(&max_pooling_op->context.max_pooling.params, params, params_size);

  // Parallelize over (batch, output row).
  max_pooling_op->compute.type = xnn_parallelization_type_2d;
  max_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_max_pooling;
  max_pooling_op->compute.range[0] = batch_size;
  max_pooling_op->compute.range[1] = output_height;
  max_pooling_op->state = xnn_run_state_ready;

  return xnn_status_success;
}
323
// Creates a max-pooling operator for uint8_t data in NHWC layout.
//
// output_min/output_max clamp every output element; min must be strictly
// below max. All other arguments are validated by create_max_pooling2d_nhwc.
// On success writes *max_pooling_op_out and returns xnn_status_success.
enum xnn_status xnn_create_max_pooling2d_nhwc_u8(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  uint8_t output_min,
  uint8_t output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out)
{
  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_u8), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const union xnn_u8_minmax_params params = xnn_init_u8_minmax_params(output_min, output_max);
  return create_max_pooling2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    pooling_height, pooling_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    channels, input_pixel_stride, output_pixel_stride,
    flags,
    // Fixed mojibake: source had "¶ms" (corrupted "&params"), which does
    // not compile; pass the address of the clamping params.
    &params, sizeof(params), XNN_INIT_FLAG_U8,
    xnn_operator_type_max_pooling_nhwc_u8,
    max_pooling_op_out);
}
362
// Creates a max-pooling operator for float data in NHWC layout.
//
// output_min/output_max clamp every output element; both must be non-NaN and
// min must be strictly below max. All other arguments are validated by
// create_max_pooling2d_nhwc. On success writes *max_pooling_op_out and
// returns xnn_status_success.
enum xnn_status xnn_create_max_pooling2d_nhwc_f32(
  uint32_t input_padding_top,
  uint32_t input_padding_right,
  uint32_t input_padding_bottom,
  uint32_t input_padding_left,
  uint32_t pooling_height,
  uint32_t pooling_width,
  uint32_t stride_height,
  uint32_t stride_width,
  uint32_t dilation_height,
  uint32_t dilation_width,
  size_t channels,
  size_t input_pixel_stride,
  size_t output_pixel_stride,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_operator_t* max_pooling_op_out)
{
  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create %s with NaN output lower bound: lower bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create %s with NaN output upper bound: upper bound must be non-NaN",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f32));
    return xnn_status_invalid_parameter;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create %s with [%.7g, %.7g] output range: lower bound must be below upper bound",
      xnn_operator_type_to_string(xnn_operator_type_max_pooling_nhwc_f32), output_min, output_max);
    return xnn_status_invalid_parameter;
  }

  const union xnn_f32_minmax_params params = xnn_init_f32_minmax_params(output_min, output_max);
  return create_max_pooling2d_nhwc(
    input_padding_top, input_padding_right, input_padding_bottom, input_padding_left,
    pooling_height, pooling_width,
    stride_height, stride_width,
    dilation_height, dilation_width,
    channels, input_pixel_stride, output_pixel_stride,
    flags,
    // Fixed mojibake: source had "¶ms" (corrupted "&params"), which does
    // not compile; pass the address of the clamping params.
    &params, sizeof(params), XNN_INIT_FLAG_F32,
    xnn_operator_type_max_pooling_nhwc_f32,
    max_pooling_op_out);
}
415
// Binds a uint8_t input/output pair to a previously-created U8 max-pooling
// operator. Rejects operators of any other type, then delegates to the
// shared setup with uint8_t element sizing (log2(sizeof(uint8_t)) == 0).
enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const uint8_t* input,
  uint8_t* output,
  pthreadpool_t threadpool)
{
  const enum xnn_operator_type expected_type = xnn_operator_type_max_pooling_nhwc_u8;
  if (max_pooling_op->type != expected_type) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(expected_type),
      xnn_operator_type_to_string(max_pooling_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_max_pooling2d_nhwc(
    max_pooling_op,
    batch_size, input_height, input_width,
    input, output,
    /*log2_input_element_size=*/0,
    /*log2_output_element_size=*/0,
    &xnn_params.u8.maxpool,
    &max_pooling_op->params.u8_minmax, sizeof(max_pooling_op->params.u8_minmax),
    pthreadpool_get_threads_count(threadpool));
}
442
// Binds a float input/output pair to a previously-created F32 max-pooling
// operator. Rejects operators of any other type, then delegates to the
// shared setup with float element sizing (log2(sizeof(float)) == 2).
enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
  xnn_operator_t max_pooling_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const float* input,
  float* output,
  pthreadpool_t threadpool)
{
  const enum xnn_operator_type expected_type = xnn_operator_type_max_pooling_nhwc_f32;
  if (max_pooling_op->type != expected_type) {
    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
      xnn_operator_type_to_string(expected_type),
      xnn_operator_type_to_string(max_pooling_op->type));
    return xnn_status_invalid_parameter;
  }

  return setup_max_pooling2d_nhwc(
    max_pooling_op,
    batch_size, input_height, input_width,
    input, output,
    /*log2_input_element_size=*/2,
    /*log2_output_element_size=*/2,
    &xnn_params.f32.maxpool,
    &max_pooling_op->params.f32_minmax, sizeof(max_pooling_op->params.f32_minmax),
    pthreadpool_get_threads_count(threadpool));
}
469
470