• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 #include "src/core/CL/kernels/CLPoolingLayerKernel.h"
25 
26 #include "arm_compute/core/CL/CLHelpers.h"
27 #include "arm_compute/core/CL/CLKernelLibrary.h"
28 #include "arm_compute/core/CL/ICLTensor.h"
29 #include "arm_compute/core/CL/OpenCL.h"
30 #include "arm_compute/core/Helpers.h"
31 #include "arm_compute/core/TensorInfo.h"
32 #include "arm_compute/core/Utils.h"
33 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
34 #include "src/core/AccessWindowStatic.h"
35 #include "src/core/CL/CLValidate.h"
36 #include "src/core/CL/ICLKernel.h"
37 #include "src/core/helpers/AutoConfiguration.h"
38 #include "src/core/helpers/WindowHelpers.h"
39 #include "support/StringSupport.h"
40 
41 #include <set>
42 #include <string>
43 #include <tuple>
44 
45 namespace arm_compute
46 {
47 using namespace arm_compute::misc::shape_calculator;
48 
49 namespace
50 {
51 // Internal window config info
52 using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
53 
auto_init(const ITensorInfo * input,ITensorInfo * output,ITensorInfo * indices,PoolingLayerInfo pool_info)54 void auto_init(const ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, PoolingLayerInfo pool_info)
55 {
56     TensorShape out_shape = compute_pool_shape(*input, pool_info);
57     auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
58     if(indices)
59     {
60         auto_init_if_empty(*indices, input->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
61     }
62 }
63 
validate_arguments(const ITensorInfo * input,const ITensorInfo * output,const PoolingLayerInfo & pool_info,const ITensorInfo * indices)64 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
65 {
66     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
67     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
68     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
69     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
70                                     "Unsupported combination of parameters!");
71 
72     // Check indices
73     if(indices)
74     {
75         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
76         ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
77         ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
78 
79         if(indices->total_size() != 0)
80         {
81             TensorInfo idx_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, DataType::U32));
82             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
83         }
84     }
85 
86     // Checks performed when output is configured
87     if(output->total_size() != 0)
88     {
89         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
90         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
91         TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type()));
92         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
93     }
94 
95     return Status{};
96 }
97 
validate_and_configure_window(ITensorInfo * input,ITensorInfo * output,const PoolingLayerInfo & pool_info,ITensorInfo * indices=nullptr)98 std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr)
99 {
100     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
101 
102     // Get data layout
103     const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
104     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
105     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
106 
107     int                 pool_stride_x   = 0;
108     int                 pool_stride_y   = 0;
109     unsigned int        pooled_w        = 0;
110     unsigned int        pooled_h        = 0;
111     int                 pool_size_x     = pool_info.is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
112     int                 pool_size_y     = pool_info.is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
113     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
114     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
115     const int  pool_pad_right  = pad_stride_info.pad_right();
116     const int  pool_pad_top    = pad_stride_info.pad_top();
117     const int  pool_pad_left   = pad_stride_info.pad_left();
118     const int  pool_pad_bottom = pad_stride_info.pad_bottom();
119     BorderSize border_size     = BorderSize();
120 
121     auto_init(input, output, indices, pool_info);
122     pooled_w = output->tensor_shape()[idx_width];
123     pooled_h = output->tensor_shape()[idx_height];
124 
125     const DataType data_type = input->data_type();
126 
127     const int input_width  = input->dimension(idx_width);
128     const int input_height = input->dimension(idx_height);
129 
130     unsigned int num_elems_processed_per_iteration = 0;
131     bool         window_changed                    = false;
132     Window       win{};
133     switch(data_layout)
134     {
135         case DataLayout::NCHW:
136         {
137             // Initialize border size
138             border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
139             // Change the number of elements processed per iteration
140             // for pooling 3x3 with stride less equal than 3
141             const bool can_optimize                         = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
142             num_elems_processed_per_iteration               = can_optimize ? 4 : 1;
143             const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
144 
145             // Number of iterations in X dimension
146             const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
147 
148             // Upper limit for the number of right/bottom border elements that are accessed
149             const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
150             const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
151 
152             border_size.right  = std::max(upper_bound_w, pool_pad_right);
153             border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
154 
155             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
156 
157             AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
158                                                pool_stride_x, pool_stride_y);
159             AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
160 
161             // Update indices window
162             if(indices)
163             {
164                 AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
165                 window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
166                 indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
167             }
168             else
169             {
170                 window_changed = update_window_and_padding(win, input_access, output_access);
171             }
172 
173             output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
174             break;
175         }
176         case DataLayout::NHWC:
177         {
178             // Initialize border size
179             border_size                       = BorderSize();
180             num_elems_processed_per_iteration = adjust_vec_size(4, output->dimension(0));
181             win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
182 
183             if(indices != nullptr)
184             {
185                 indices->set_valid_region(ValidRegion(Coordinates(), indices->tensor_shape()));
186             }
187 
188             output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
189             break;
190         }
191         default:
192             ARM_COMPUTE_ERROR("Not implemented");
193     }
194 
195     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
196     return std::make_tuple(err, win, CLPoolingConfig(num_elems_processed_per_iteration, border_size));
197 }
198 } // namespace
199 
CLPoolingLayerKernel()200 CLPoolingLayerKernel::CLPoolingLayerKernel()
201     : _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
202 {
203 }
204 
border_size() const205 BorderSize CLPoolingLayerKernel::border_size() const
206 {
207     return _border_size;
208 }
209 
configure(const ICLTensor * input,ICLTensor * output,const PoolingLayerInfo & pool_info,ICLTensor * indices)210 void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
211 {
212     configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
213 }
214 
configure(const CLCompileContext & compile_context,const ICLTensor * input,ICLTensor * output,const PoolingLayerInfo & pool_info,ICLTensor * indices)215 void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
216 {
217     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
218 
219     auto padding_info = get_padding_info({ input, output, indices });
220 
221     // Set instance variables
222     _input                              = input;
223     _output                             = output;
224     _pool_info                          = pool_info;
225     _data_layout                        = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
226     _indices                            = indices;
227     int                 pool_stride_x   = 0;
228     int                 pool_stride_y   = 0;
229     const PoolingType   pool_type       = pool_info.pool_type;
230     const int           idx_width       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
231     const int           idx_height      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
232     const int           idx_channel     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
233     const int           idx_batch_size  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
234     const int           pool_size_x     = pool_info.is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width;
235     const int           pool_size_y     = pool_info.is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height;
236     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
237     const bool          exclude_padding = pool_info.exclude_padding;
238     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
239     const int pool_pad_top  = pad_stride_info.pad_top();
240     const int pool_pad_left = pad_stride_info.pad_left();
241 
242     // Set build options
243     CLBuildOptions build_opts;
244     const DataType data_type = input->info()->data_type();
245 
246     // Configure kernel window
247     auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, (indices ? indices->info() : nullptr));
248 
249     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
250     ICLKernel::configure_internal(std::get<1>(win_config));
251 
252     CLPoolingConfig pooling_config     = std::get<2>(win_config);
253     _num_elems_processed_per_iteration = pooling_config.first;
254     _border_size                       = pooling_config.second;
255 
256     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
257 
258     // Tensor paddings are used to calculate the indicies for MAX pooling
259     if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
260     {
261         build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(input->info()->padding().left));
262         build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(input->info()->padding().right));
263         build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(input->info()->padding().top));
264         build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(input->info()->padding().bottom));
265         build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
266         build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
267         build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
268     }
269 
270     if(is_data_type_quantized_asymmetric(data_type) && input->info()->quantization_info() != output->info()->quantization_info())
271     {
272         const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
273         const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
274 
275         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
276         build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
277         build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
278         build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
279     }
280 
281     // Check output dimensions
282     auto_init(input->info(), output->info(), indices ? indices->info() : nullptr, pool_info);
283 
284     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
285 
286     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
287     build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
288     build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
289     build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
290     build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
291     build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
292     build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
293     build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
294 
295     // Set the initial value for the pooling operation accordingly with the data type
296     if(pool_type == PoolingType::MAX)
297     {
298         if(is_data_type_quantized(data_type))
299         {
300             PixelValue type_min{};
301             std::tie(type_min, std::ignore) = get_min_max(data_type);
302             build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
303         }
304         else
305         {
306             build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
307         }
308     }
309     else
310     {
311         // Pool AVG and Pool L2 initial value
312         build_opts.add_option("-DINITIAL_VALUE=0");
313     }
314 
315     build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
316     build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
317 
318     // Create kernel
319     switch(_data_layout)
320     {
321         case DataLayout::NCHW:
322         {
323             const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
324             const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
325             const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
326             build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
327             build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
328 
329             if(pool_type != PoolingType::MAX)
330             {
331                 build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
332             }
333 
334             if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
335             {
336                 // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
337                 // each thread computes 4 output elements
338                 const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3);
339 
340                 std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
341                                           + support::cpp11::to_string(pool_size_x);
342                 _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
343             }
344             else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
345             {
346                 // For max pooling with pool2x2, store indicies which will be used in max unpooling
347                 if(data_type == DataType::F32)
348                 {
349                     std::string kernel_name = "pooling_layer_2_nchw_indices_fp32";
350                     _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
351                 }
352                 else if(data_type == DataType::F16)
353                 {
354                     std::string kernel_name = "pooling_layer_2_nchw_indices_fp16";
355                     _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
356                 }
357             }
358             else // Run general case
359             {
360                 std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
361                 _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
362             }
363             break;
364         }
365         case DataLayout::NHWC:
366         {
367             // Floating point mixed precision is support on F16 only
368             const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
369 
370             // Wider accumulation is required to avoid accuracy loss
371             // Case 1: Floating point mixed precision (fp16 input data and fp32 accumulation)
372             // Cast 2: Quantized (int8/uint8 input data and int32 accumulation )
373             DataType acc_data_type = data_type;
374 
375             if(use_fp_mixed_precision)
376             {
377                 acc_data_type = DataType::F32;
378             }
379             else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
380             {
381                 acc_data_type = DataType::S32;
382             }
383 
384             build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
385             build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
386             build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
387             build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
388             build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
389             build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
390             build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
391             build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch_size)));
392             build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % _num_elems_processed_per_iteration));
393             if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
394             {
395                 build_opts.add_option_if(_indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
396 
397                 std::string kernel_name = "pooling_layer_2x2_nhwc";
398                 _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
399             }
400             else
401             {
402                 std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
403                 _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
404             }
405             break;
406         }
407         default:
408             ARM_COMPUTE_ERROR("Not implemented");
409     }
410 
411     // Set config_id for enabling LWS tuning
412     _config_id = "pooling_layer_";
413     _config_id += lower_string(string_from_data_type(data_type));
414     _config_id += "_";
415     _config_id += lower_string(string_from_data_layout(_data_layout));
416     _config_id += "_";
417     _config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
418     _config_id += "_";
419     _config_id += support::cpp11::to_string(output->info()->dimension(idx_height));
420     _config_id += "_";
421     _config_id += support::cpp11::to_string(output->info()->dimension(idx_channel));
422     _config_id += "_";
423     _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
424 
425     ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
426 }
427 
validate(const ITensorInfo * input,const ITensorInfo * output,const PoolingLayerInfo & pool_info,const ITensorInfo * indices)428 Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
429 {
430     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
431     ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info)));
432 
433     return Status{};
434 }
435 
run(const Window & window,cl::CommandQueue & queue)436 void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
437 {
438     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
439     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
440 
441     unsigned int pool_stride_x = 0;
442     unsigned int pool_stride_y = 0;
443     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
444 
445     // Collapse window
446     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
447 
448     switch(_data_layout)
449     {
450         case DataLayout::NCHW:
451         {
452             Window slice = window_collapsed.first_slice_window_3D();
453             do
454             {
455                 // Upsample input by pool size
456                 Window in_slice(slice);
457                 in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(),
458                                                              (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x,
459                                                              pool_stride_x * _num_elems_processed_per_iteration));
460                 in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(),
461                                                              (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y,
462                                                              pool_stride_y));
463 
464                 // Set inputs
465                 unsigned int idx = 0;
466                 add_3D_tensor_argument(idx, _input, in_slice);
467                 add_3D_tensor_argument(idx, _output, slice);
468                 if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
469                 {
470                     add_3D_tensor_argument(idx, _indices, slice);
471                 }
472                 enqueue(queue, *this, slice, lws_hint());
473             }
474             while(window_collapsed.slide_window_slice_3D(slice));
475             break;
476         }
477         case DataLayout::NHWC:
478         {
479             const size_t batch_size = _output->info()->tensor_shape().total_size_upper(3);
480 
481             Window slice    = window_collapsed.first_slice_window_4D();
482             Window in_slice = window_collapsed.first_slice_window_4D();
483             in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
484             in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
485             in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
486             in_slice.set(3, Window::Dimension(0, batch_size, 1));
487             do
488             {
489                 // Set inputs
490                 unsigned int idx = 0;
491                 add_4D_tensor_argument(idx, _input, in_slice);
492                 add_4D_tensor_argument(idx, _output, slice);
493                 if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
494                 {
495                     add_4D_tensor_argument(idx, _indices, slice);
496                 }
497                 enqueue(queue, *this, slice, lws_hint());
498             }
499             while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
500             break;
501         }
502         default:
503             ARM_COMPUTE_ERROR("Not implemented");
504     }
505 }
506 } // namespace arm_compute
507