1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7 #include <math.h>
8 #include <stdbool.h>
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <stdlib.h>
12 #include <string.h>
13
14 #include <xnnpack.h>
15 #include <xnnpack/allocator.h>
16 #include <xnnpack/operator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/log.h>
19 #include <xnnpack/math.h>
20 #include <xnnpack/params-init.h>
21 #include <xnnpack/params.h>
22 #include <xnnpack/indirection.h>
23
24
// Returns how many whole pooling windows of size `kernel_dimension` fit into
// `padded_input_dimension` elements (input extent plus padding on both sides).
// `kernel_dimension` must be non-zero.
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t kernel_dimension)
{
  // Unsigned division truncates, so a trailing partial window yields no output element.
  const size_t whole_windows = padded_input_dimension / kernel_dimension;
  return whole_windows;
}
31
select_ukernel(size_t pooling_size,const struct argmaxpool_parameters * ukernel)32 static const struct argmaxpool_parameters* select_ukernel(
33 size_t pooling_size,
34 const struct argmaxpool_parameters* ukernel)
35 {
36 while (ukernel->qr == 0 && ukernel->mr < pooling_size) {
37 ukernel++;
38 }
39 return ukernel;
40 }
41
// Creates an Argmax Pooling operator descriptor for F32 data.
//
// The pooling windows do not overlap: the operator's strides are set equal to
// the pooling dimensions and its dilation is fixed at 1.
//
// Parameters:
//   input_padding_{top,right,bottom,left} - padding stored in the operator and
//       later added to the input extent when output dimensions are computed.
//   pooling_height, pooling_width - pooling window; both must be non-zero and
//       the window must contain more than one element.
//   channels            - number of channels; must be non-zero.
//   input_pixel_stride  - must be at least `channels`.
//   output_pixel_stride - must be at least `channels`.
//   output_min, output_max - output range; neither may be NaN and output_min
//       must be strictly below output_max.
//   flags               - not consulted by this function.
//   argmax_pooling_op_out - receives the new operator on success; it is not
//       written on failure.
//
// Returns:
//   xnn_status_success            on success.
//   xnn_status_uninitialized      if XNNPACK has not been initialized.
//   xnn_status_invalid_parameter  if any argument fails validation.
//   xnn_status_out_of_memory      if the operator descriptor cannot be allocated.
enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t pooling_height,
    uint32_t pooling_width,
    size_t channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* argmax_pooling_op_out)
{
  xnn_operator_t argmax_pooling_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Argmax Pooling operator: XNNPACK is not initialized");
    goto error;
  }

  // Every check below reports an invalid parameter on failure.
  status = xnn_status_invalid_parameter;

  // The pooling dimensions are validated individually rather than through
  // their 32-bit product: the product can wrap around (e.g. 65536 x 65536
  // wraps to 0), which would reject a valid window with a misleading message.
  if (pooling_height == 0 || pooling_width == 0) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with %" PRIu32 "x%" PRIu32 " pooling size: "
      "pooling size dimensions must be non-zero",
      pooling_width, pooling_height);
    goto error;
  }

  if (pooling_height == 1 && pooling_width == 1) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with 1 pooling element: "
      "1x1 pooling is meaningless");
    goto error;
  }

  if (channels == 0) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with %zu channels: "
      "number of channels must be non-zero",
      channels);
    goto error;
  }

  if (input_pixel_stride < channels) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      input_pixel_stride, channels);
    goto error;
  }

  if (output_pixel_stride < channels) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of channels (%zu)",
      output_pixel_stride, channels);
    goto error;
  }

  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with NaN output lower bound: "
      "lower bound must be non-NaN");
    goto error;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with NaN output upper bound: "
      "upper bound must be non-NaN");
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Argmax Pooling operator with [%.7g, %.7g] output range: "
      "lower bound must be below upper bound",
      output_min, output_max);
    goto error;
  }

  // From here on the only failure mode is allocation failure.
  status = xnn_status_out_of_memory;

  argmax_pooling_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (argmax_pooling_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Argmax Pooling operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  argmax_pooling_op->padding_top = input_padding_top;
  argmax_pooling_op->padding_right = input_padding_right;
  argmax_pooling_op->padding_bottom = input_padding_bottom;
  argmax_pooling_op->padding_left = input_padding_left;

  // Stride equals the kernel size in both directions: windows never overlap.
  argmax_pooling_op->kernel_height = pooling_height;
  argmax_pooling_op->kernel_width = pooling_width;
  argmax_pooling_op->stride_height = pooling_height;
  argmax_pooling_op->stride_width = pooling_width;
  argmax_pooling_op->dilation_height = 1;
  argmax_pooling_op->dilation_width = 1;
  argmax_pooling_op->channels = channels;
  argmax_pooling_op->input_pixel_stride = input_pixel_stride;
  argmax_pooling_op->output_pixel_stride = output_pixel_stride;

  argmax_pooling_op->f32_output_params = xnn_init_f32_output_params(output_min, output_max);

  argmax_pooling_op->type = xnn_operator_type_argmax_pooling_nhwc_f32;
  argmax_pooling_op->ukernel.type = xnn_ukernel_type_argmax_pooling;

  // The operator is not runnable until it has been set up.
  argmax_pooling_op->state = xnn_run_state_invalid;

  *argmax_pooling_op_out = argmax_pooling_op;
  return xnn_status_success;

error:
  // argmax_pooling_op is still NULL here unless the descriptor was allocated.
  xnn_delete_operator(argmax_pooling_op);
  return status;
}
166
// Binds an Argmax Pooling (NHWC, F32) operator to concrete input/output
// buffers and tensor dimensions, and prepares its parallel compute task.
//
// Parameters:
//   argmax_pooling_op - operator whose type must be
//       xnn_operator_type_argmax_pooling_nhwc_f32.
//   batch_size   - number of images; 0 marks the operator as skipped.
//   input_height, input_width - input dimensions; both must be non-zero.
//   input        - input data pointer.
//   output       - receives the pooled values; consecutive pixels are
//       output_pixel_stride floats apart.
//   index        - receives the argmax indices; rows are laid out as
//       output_width * channels uint32_t elements.
//   threadpool   - not used by this function.
//
// Returns:
//   xnn_status_success            on success (including the skipped cases).
//   xnn_status_invalid_parameter  on operator type mismatch or zero input size.
//   xnn_status_uninitialized      if XNNPACK has not been initialized.
//   xnn_status_out_of_memory      if the indirection buffer cannot be (re)allocated.
enum xnn_status xnn_setup_argmax_pooling2d_nhwc_f32(
    xnn_operator_t argmax_pooling_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const float* input,
    float* output,
    uint32_t* index,
    pthreadpool_t threadpool)
{
  if (argmax_pooling_op->type != xnn_operator_type_argmax_pooling_nhwc_f32) {
    xnn_log_error("failed to setup Argmax Pooling (NHWC, F32) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }
  // The operator stays non-runnable unless setup reaches one of the success paths.
  argmax_pooling_op->state = xnn_run_state_invalid;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to setup Argmax Pooling operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup Argmax Pooling operator with %zux%zu input: input dimensions must be non-zero",
      input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    argmax_pooling_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  argmax_pooling_op->batch_size = batch_size;
  argmax_pooling_op->input_height = input_height;
  argmax_pooling_op->input_width = input_width;
  argmax_pooling_op->input = input;

  argmax_pooling_op->output_height = compute_output_dimension(
    argmax_pooling_op->padding_top + input_height + argmax_pooling_op->padding_bottom,
    argmax_pooling_op->kernel_height);
  argmax_pooling_op->output_width = compute_output_dimension(
    argmax_pooling_op->padding_left + input_width + argmax_pooling_op->padding_right,
    argmax_pooling_op->kernel_width);

  const size_t pooling_height = argmax_pooling_op->kernel_height;
  const size_t pooling_width = argmax_pooling_op->kernel_width;
  const size_t pooling_size = pooling_height * pooling_width;
  const size_t output_height = argmax_pooling_op->output_height;
  const size_t output_width = argmax_pooling_op->output_width;

  if (output_height == 0 || output_width == 0) {
    // The padded input is smaller than the pooling window along at least one
    // axis, so there are no output pixels. Without this guard the expression
    // (output_width - 1) below wraps around, and a non-empty compute range
    // could be scheduled with output_width == 0.
    // NOTE(review): treated like the empty-batch case (nothing to run); confirm
    // that reporting success is preferred over xnn_status_invalid_parameter.
    argmax_pooling_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  const struct argmaxpool_parameters* ukernel = select_ukernel(pooling_size, xnn_params.f32.argmaxpool);
  const uint32_t mr = ukernel->mr;

  // Indirection buffer geometry: pooling_size pointers per output pixel,
  // output_width output pixels per output row.
  const size_t step_width = pooling_width;
  const size_t step_height = pooling_size + (output_width - 1) * step_width * pooling_height;

  // The indirection buffer is rebuilt only when the input dimensions differ
  // from those it was last built for; a changed input pointer alone is
  // accounted for through input_offset below.
  if (input_height != argmax_pooling_op->last_input_height ||
      input_width != argmax_pooling_op->last_input_width)
  {
    // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
    const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + output_height * step_height);

    // Keep the old pointer in the operator until reallocation is known to have succeeded.
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(argmax_pooling_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    argmax_pooling_op->indirection_buffer = indirection_buffer;

    xnn_indirection_init_maxpool2d(argmax_pooling_op, step_height, step_width, 2 /* log2(sizeof(float)) */);

    argmax_pooling_op->last_input = input;
    argmax_pooling_op->last_input_height = input_height;
    argmax_pooling_op->last_input_width = input_width;
  }

  const size_t channels = argmax_pooling_op->channels;

  // All strides below are expressed in bytes.
  const size_t indirect_input_height_stride = step_height * sizeof(void*);
  const size_t output_width_stride = argmax_pooling_op->output_pixel_stride * sizeof(float);
  const size_t output_height_stride = output_width * output_width_stride;
  const size_t index_height_stride = output_width * channels * sizeof(uint32_t);

  const uint32_t qr = ukernel->qr;
  // Zero when the selected descriptor has qr == 0; otherwise derived from mr, qr
  // and the pooling size, and subtracted from the indirection pointer increment.
  const size_t multipass_adjustment = qr == 0 ? 0 : round_up(pooling_size - mr, qr) + mr - qr;
  argmax_pooling_op->context.argmax_pooling = (struct argmax_pooling_context) {
    .indirect_input = argmax_pooling_op->indirection_buffer,
    .indirect_input_height_stride = indirect_input_height_stride,
    // Byte distance between the current input and the input the indirection
    // buffer was last initialized with.
    .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) argmax_pooling_op->last_input),
    .input_batch_stride = input_height * input_width * argmax_pooling_op->input_pixel_stride * sizeof(float),
    .output = output,
    .output_batch_stride = output_height * output_height_stride,
    .output_height_stride = output_height_stride,
    .output_width = output_width,
    .index = index,
    .index_batch_stride = output_height * index_height_stride,
    .index_height_stride = index_height_stride,
    .pooling_size = pooling_size,
    .channels = channels,
    .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
    .output_increment = output_width_stride - channels * sizeof(float),
    .params.f32 = argmax_pooling_op->f32_output_params,
  };
  // One task per (batch element, output row) pair.
  argmax_pooling_op->compute.type = xnn_parallelization_type_2d;
  argmax_pooling_op->compute.range[0] = batch_size;
  argmax_pooling_op->compute.range[1] = output_height;

  if (pooling_size <= mr) {
    // The whole pooling window fits within mr elements: use the unipass micro-kernel.
    argmax_pooling_op->context.argmax_pooling.unipass_ukernel = ukernel->up;
    argmax_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_argmax_pooling_unipass;
  } else {
    argmax_pooling_op->context.argmax_pooling.multipass_ukernel = ukernel->mp;
    argmax_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_argmax_pooling_multipass;
  }
  argmax_pooling_op->state = xnn_run_state_ready;

  return xnn_status_success;
}
285
286