/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#pragma once

#include <algorithm>
#include <cstdint>

#include "depthwise.hpp"
#include "padding.hpp"
#include "utils.hpp"

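/*
 * MEMBERFN(TOUT) expands to the template preamble and qualified class name
 * needed to define an out-of-line member of DepthwiseConvolutionBase with
 * return type TOUT. For example,
 *
 *   MEMBERFN(int)::n_channels(void) const { ... }
 *
 * defines DepthwiseConvolutionBase<...>::n_channels for every instantiation.
 */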
#define MEMBERFN(TOUT) template <\
  unsigned int OutputTileRows, unsigned int OutputTileColumns,\
  unsigned int KernelRows, unsigned int KernelColumns,\
  unsigned int StrideRows, unsigned int StrideColumns,\
  typename TIn, typename TBias, typename TOut,\
  typename Derived\
> TOUT DepthwiseConvolutionBase<\
  OutputTileRows, OutputTileColumns,\
  KernelRows, KernelColumns,\
  StrideRows, StrideColumns,\
  TIn, TBias, TOut, Derived\
>

using namespace neon_convolution_kernels;

namespace depthwise
{

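// Copies weights and biases into the packed parameter buffer laid out for the
// tile kernels. Specialisations (selected on kernel size and element sizes)
// are expected to be provided by the implementation files that include this
// header.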
template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
struct PackParameters
{
  static void execute(
    unsigned int n_channels,
    void *buffer,
    const void *weights,
    unsigned int weight_row_stride,
    unsigned int weight_col_stride,
    const void *biases
  );
};

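// Number of channels covered by one unit of work: get_window() reports one
// window entry per CHANNEL_BLOCK channels, and run() operates on the
// corresponding block.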
const unsigned int CHANNEL_BLOCK = 16;

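// Compute the output size of one spatial dimension as
//   ceil((dim_size + padding_before + padding_after - kernel_size + 1) / stride).
// Note this uses KernelRows and StrideRows for both dimensions, so it assumes
// square kernels and strides; output_size is the non-static public wrapper.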
MEMBERFN(int)::get_output_size(
  const int dim_size, const unsigned int padding_before, const unsigned int padding_after
)
{
  return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
}

MEMBERFN(int)::output_size(
  const int dim_size, const unsigned int padding_before, const unsigned int padding_after
) const
{
  return get_output_size(dim_size, padding_before, padding_after);
}

MEMBERFN()::DepthwiseConvolutionBase(
  const int n_batches,
  const int n_input_rows,
  const int n_input_cols,
  const int n_channels,
  ActivationFunction activation,
  const unsigned int padding_top,
  const unsigned int padding_left,
  const unsigned int padding_bottom,
  const unsigned int padding_right
) : DepthwiseConvolutionBase(
    n_batches, n_input_rows, n_input_cols, n_channels,
    get_output_size(n_input_rows, padding_top, padding_bottom),
    get_output_size(n_input_cols, padding_left, padding_right),
    activation,
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

MEMBERFN()::DepthwiseConvolutionBase(
  const int n_batches,
  const int n_input_rows,
  const int n_input_cols,
  const int n_channels,
  const int n_output_rows,
  const int n_output_cols,
  ActivationFunction activation,
  const unsigned int padding_top,
  const unsigned int padding_left,
  const unsigned int padding_bottom,
  const unsigned int padding_right
) : _input(nullptr), _output(nullptr),
    _packed_parameters(nullptr),
    _working_space(nullptr),
    _n_batches(n_batches),
    _n_input_rows(n_input_rows),
    _n_input_cols(n_input_cols),
    _n_channels(n_channels),
    _n_output_rows(n_output_rows),
    _n_output_cols(n_output_cols),
    _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
    _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
    _padding_top(padding_top),
    _padding_left(padding_left),
    _padding_bottom(padding_bottom),
    _padding_right(padding_right),
    _activation(activation),
    _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
    _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0)
{
}

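// The set_input overloads successively default the missing strides to those
// of a dense NHWC tensor: the column stride defaults to the number of
// channels, the row stride to n_input_cols * ld_col, and the batch stride to
// n_input_rows * ld_row.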
MEMBERFN(void)::set_input(const void* const inptr)
{
  set_input(inptr, _n_channels);
}

MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
{
  set_input(inptr, _n_input_cols * ld_col, ld_col);
}

MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
{
  set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);
}

MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
{
  _input = static_cast<const TIn *>(inptr);
  _input_batch_stride = ld_batch;
  _input_row_stride = ld_row;
  _input_col_stride = ld_col;
}

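// The set_output overloads default the output strides in the same way, using
// the output tensor's dimensions.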
MEMBERFN(void)::set_output(void* const outptr)
{
  set_output(outptr, _n_channels);
}

MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
{
  set_output(outptr, _n_output_cols * ld_col, ld_col);
}

MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
{
  set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);
}

MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
{
  _output = static_cast<TOut *>(outptr);
  _output_batch_stride = ld_batch;
  _output_row_stride = ld_row;
  _output_col_stride = ld_col;
}

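// Size, in bytes, of the buffer needed to hold the packed parameters: per
// channel, KernelRows * KernelColumns weight elements plus one bias element.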
MEMBERFN(size_t)::get_packed_params_size(void) const
{
  return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
}

MEMBERFN(void)::set_packed_params_buffer(void *buffer)
{
  _packed_parameters = buffer;
}

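// The pack_params overloads default the destination buffer and the weight
// strides (assuming densely packed HWC-ordered weights), then dispatch
// through the derived class so that implementations can override the packing
// routine.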
MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
{
  static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);
}

MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
{
  const unsigned int weight_col_stride = _n_channels;
  const unsigned int weight_row_stride = KernelColumns * weight_col_stride;
  static_cast<const Derived *>(this)->pack_params(
    buffer, weights, weight_row_stride, weight_col_stride, biases
  );
}

MEMBERFN(void)::pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  static_cast<const Derived *>(this)->_pack_params(
    buffer, weights, weight_row_stride, weight_col_stride, biases
  );
}

MEMBERFN(void)::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  // Default implementation: pack with the element sizes assumed by
  // get_packed_params_size.
  PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TBias)>::execute(
    _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
  );
}

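// Each thread needs a private scratch area consisting of one channel-vector
// of input elements (used as the source for padded inputs) and one
// channel-vector of output elements (used as a sink for padded outputs).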
MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
{
  return nthreads * (
    _get_input_working_space_size() + _get_output_working_space_size()
  );
}

MEMBERFN(void)::set_working_space(void *buffer)
{
  _working_space = buffer;
}

MEMBERFN(size_t)::_get_input_working_space_size(void) const
{
  return sizeof(TIn) * _n_channels;
}

MEMBERFN(size_t)::_get_output_working_space_size(void) const
{
  return sizeof(TOut) * _n_channels;
}

MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
{
  return static_cast<uint8_t*>(_working_space) + threadid * (
    _get_input_working_space_size() + _get_output_working_space_size()
  );
}

MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
{
  return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();
}

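// The "window" is the number of independent units of work; run() executes the
// units in [start, stop), each covering CHANNEL_BLOCK channels.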
MEMBERFN(unsigned int)::get_window() const
{
  // Parallelise over blocks of channels.
  return iceildiv(_n_channels, CHANNEL_BLOCK);
}

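// Execute the work units in [start, stop) on the calling thread: fill this
// thread's input padding buffer with the padding value, then process every
// tile row of every batch for the assigned block of channels.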
MEMBERFN(void)::run(
  const unsigned int start,
  const unsigned int stop,
  const unsigned int threadid
)
{
  // Clear the input padding buffer
  TIn *buf = static_cast<TIn *>(_get_input_working_space(threadid));
  const TIn pad_value = static_cast<Derived *>(this)->_input_padding_value();
  for (int n = 0; n < _n_channels; n++)
  {
    buf[n] = pad_value;
  }

  // Parallelise over blocks of channels
  const auto start_channel = CHANNEL_BLOCK * start;
  const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
  const auto params_size_per_channel = this->get_packed_params_size()/_n_channels;

  // Compute top and bottom padding for input and output
  const int input_pad_top = _padding_top;
  const int input_pad_left = _padding_left;
  constexpr int tile_overlap = kernel_rows - stride_rows;

  // Perform the convolution by calling `process_tile_row` for each tile row in
  // each batch.
  for (int batch = 0; batch < _n_batches; batch++)
  {
    const TIn* const inptr_batch = _input + batch*_input_batch_stride;
    TOut* const outptr_batch = _output + batch*_output_batch_stride;

    // Loop over rows of tiles
    for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
    {
      // Pointer to the row
      const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
      const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
      TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;

      // Input padding (top + bottom) for the row
      const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
      const int input_row_bottom = input_row_top + inner_tile_rows;
      const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
      const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);

      // Output padding (bottom) for the row
      const int output_row_bottom = (tile_i + 1)*output_tile_rows;
      const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);

      // Get the offset into the packed parameters
      const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
                              start_channel*params_size_per_channel;

      // Process the row
      process_tile_row(
        threadid,
        stop_channel - start_channel,
        params_ptr,
        inptr_row + start_channel,
        outptr_row + start_channel,
        input_row_pad_top, input_pad_left, input_row_pad_bottom,
        output_row_pad_bottom,
        _n_tile_cols, _n_input_cols, _n_output_cols
      );
    }
  }
}

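// Process one row of tiles: for each tile in the row, derive the left/right
// input padding and right output padding, compute the tile's input and output
// pointers, and hand the tile to process_tile.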
MEMBERFN(void)::process_tile_row(
  const unsigned int threadid,
  const int n_channels,
  const void* const packed_params,
  const TIn* const inptr,
  TOut* const outptr,
  const int row_pad_in_top,
  const int row_pad_in_left,
  const int row_pad_in_bottom,
  const int row_pad_out_bottom,
  const int n_tiles,
  const int n_input_cols,
  const int n_output_cols
)
{
  constexpr int tile_overlap = kernel_cols - stride_cols;

  // Loop over columns of tiles
  for (int tile_j = 0; tile_j < n_tiles; tile_j++)
  {
    // Input padding (left + right) for the tile
    const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
    const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
    const int t_in_end = t_in_start + inner_tile_cols;
    const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);

    // Output padding (right) for the tile
    const int t_out_end = (tile_j + 1) * output_tile_cols;
    const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);

    // Get pointers into the inputs and outputs
    const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
    TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;

    // Process just this tile
    process_tile(
      threadid, n_channels, packed_params, inptr_col, outptr_col,
      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,  // Input paddings
      row_pad_out_bottom, t_pad_out_right  // Output paddings
    );
  }
}

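// Value used to fill the input padding buffer. Since run() invokes this
// through the Derived pointer, derived classes may shadow it to supply a
// non-zero padding value.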
MEMBERFN(TIn)::_input_padding_value(void) const
{
  return static_cast<TIn>(0);
}

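// Process a single tile. If the tile requires no padding, dispatch straight
// to the strided execute_tile specialisation for the active activation
// function. Otherwise, build per-element arrays of input and output pointers
// in which padded positions are redirected to this thread's working space,
// and dispatch to the pointer-array specialisation.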
MEMBERFN(void)::process_tile(
  const unsigned int threadid,
  const int n_channels,
  const void* const packed_params,
  const TIn* const inptr,
  TOut* const outptr,
  const int pad_in_top,
  const int pad_in_left,
  const int pad_in_bottom,
  const int pad_in_right,
  const int pad_out_bottom,
  const int pad_out_right
)
{
  Derived * dthis = static_cast<Derived *>(this);
  const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
  const bool pad_output = pad_out_bottom || pad_out_right;

  if (!pad_input && !pad_output)
  {
    switch(_activation)
    {
      case ActivationFunction::ReLU:
        dthis->template execute_tile<ActivationFunction::ReLU>(
          n_channels, packed_params,
          inptr, _input_row_stride, _input_col_stride,
          outptr, _output_row_stride, _output_col_stride
        );
        break;
      case ActivationFunction::ReLU6:
        dthis->template execute_tile<ActivationFunction::ReLU6>(
          n_channels, packed_params,
          inptr, _input_row_stride, _input_col_stride,
          outptr, _output_row_stride, _output_col_stride
        );
        break;
      default:
        dthis->template execute_tile<ActivationFunction::None>(
          n_channels, packed_params,
          inptr, _input_row_stride, _input_col_stride,
          outptr, _output_row_stride, _output_col_stride
        );
        break;
    }
  }
  else
  {
    // Create arrays of input and output pointers, pointing padded elements to
    // the working space padding buffers provided.
    const TIn *inptrs[inner_tile_rows][inner_tile_cols];
    for (int i = 0; i < inner_tile_rows; i++)
    {
      for (int j = 0; j < inner_tile_cols; j++)
      {
        if (i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i ||
            j < pad_in_left || (inner_tile_cols - pad_in_right) <= j)
        {
          // Padded input
          inptrs[i][j] = static_cast<const TIn *>(_get_input_working_space(threadid));
        }
        else
        {
          inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride;
        }
      }
    }

    TOut *outptrs[output_tile_rows][output_tile_cols];
    for (int i = 0; i < output_tile_rows; i++)
    {
      for (int j = 0; j < output_tile_cols; j++)
      {
        if (i < (output_tile_rows - pad_out_bottom) &&
            j < (output_tile_cols - pad_out_right))
        {
          outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride;
        }
        else
        {
          outptrs[i][j] = static_cast<TOut *>(_get_output_working_space(threadid));
        }
      }
    }

    switch(_activation)
    {
      case ActivationFunction::ReLU:
        dthis->template execute_tile<ActivationFunction::ReLU>(
          n_channels, packed_params, inptrs, outptrs
        );
        break;
      case ActivationFunction::ReLU6:
        dthis->template execute_tile<ActivationFunction::ReLU6>(
          n_channels, packed_params, inptrs, outptrs
        );
        break;
      default:
        dthis->template execute_tile<ActivationFunction::None>(
          n_channels, packed_params, inptrs, outptrs
        );
        break;
    }
  }
}

MEMBERFN(int)::n_channels(void) const
{
  return _n_channels;
}

}  // namespace depthwise