• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2018-2019 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 /*
26  * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
27  *
28  *          NOTE: Header to be included by implementation files only.
29  *
30  * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
31  */
32 
33 #include <algorithm>
34 #include <cstdint>
35 #include "depthwise.hpp"
36 #include "padding.hpp"
37 #include "utils.hpp"
38 
39 #pragma once
40 
// Helper macro for defining out-of-line member functions of the
// DepthwiseConvolutionBase class template without repeating its full
// template parameter list for every definition.  TOUT is the member
// function's return type, e.g.:
//   MEMBERFN(int)::output_size(...) { ... }
#define MEMBERFN(TOUT) template <\
  unsigned int OutputTileRows, unsigned int OutputTileColumns,\
  unsigned int KernelRows, unsigned int KernelColumns,\
  unsigned int StrideRows, unsigned int StrideColumns,\
  typename TIn, typename TBias, typename TOut,\
  typename Derived\
> TOUT DepthwiseConvolutionBase<\
  OutputTileRows, OutputTileColumns,\
  KernelRows, KernelColumns,\
  StrideRows, StrideColumns,\
  TIn, TBias, TOut, Derived\
>
53 
54 using namespace neon_convolution_kernels;
55 
56 namespace depthwise
57 {
58 
// Parameter-packing helper: rearranges the weights and biases for
// `n_channels` channels into the layout expected by the depthwise kernels,
// writing the result into `buffer`.  Specialised on kernel size and on the
// byte sizes of the weight and bias elements; the definitions live in the
// implementation files that include this header.
template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
struct PackParameters
{
  static void execute(
    unsigned int n_channels,
    void *buffer,                    // destination for the packed parameters
    const void *weights,
    unsigned int weight_row_stride,  // stride (in elements) between kernel rows
    unsigned int weight_col_stride,  // stride (in elements) between kernel columns
    const void *biases               // presumably may be null when no bias is used - verify against specialisations
  );
};
71 
72 const unsigned int CHANNEL_BLOCK = 16;
73 
MEMBERFN(int)74 MEMBERFN(int)::get_output_size(
75   const int dim_size, const unsigned int padding_before, const unsigned int padding_after
76 )
77 {
78   return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
79 }
80 
// Instance-method wrapper around the static get_output_size, for callers
// holding an object rather than the template parameters.
MEMBERFN(int)::output_size(
  const int dim_size, const unsigned int padding_before, const unsigned int padding_after
) const
{
  return get_output_size(dim_size, padding_before, padding_after);
}
87 
// Construct a depthwise convolution engine when only the input shape and
// padding are known; the output shape is derived from them.
MEMBERFN()::DepthwiseConvolutionBase(
  const int n_batches,
  const int n_input_rows,
  const int n_input_cols,
  const int n_channels,
  ActivationFunction activation,
  const unsigned int padding_top,
  const unsigned int padding_left,
  const unsigned int padding_bottom,
  const unsigned int padding_right
) : DepthwiseConvolutionBase(
      // Delegate to the fully-specified constructor, computing the output
      // extents from the input extents and the padding.
      n_batches, n_input_rows, n_input_cols, n_channels,
      get_output_size(n_input_rows, padding_top, padding_bottom),
      get_output_size(n_input_cols, padding_left, padding_right),
      activation,
      padding_top, padding_left, padding_bottom, padding_right
    )
{
}
107 
// Fully-specified constructor: both the input and the output shapes are
// given explicitly.  Data pointers and strides are left unset here; they
// must be provided through set_input / set_output (and the parameter and
// scratch buffers through set_packed_params_buffer / set_working_space)
// before run is called.
MEMBERFN()::DepthwiseConvolutionBase(
  const int n_batches,
  const int n_input_rows,
  const int n_input_cols,
  const int n_channels,
  const int n_output_rows,
  const int n_output_cols,
  ActivationFunction activation,
  const unsigned int padding_top,
  const unsigned int padding_left,
  const unsigned int padding_bottom,
  const unsigned int padding_right
) : _input(nullptr), _output(nullptr),
    _packed_parameters(nullptr),
    _working_space(nullptr),
    _n_batches(n_batches),
    _n_input_rows(n_input_rows),
    _n_input_cols(n_input_cols),
    _n_channels(n_channels),
    _n_output_rows(n_output_rows),
    _n_output_cols(n_output_cols),
    // Number of output tiles needed to cover the output in each dimension.
    _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
    _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
    _padding_top(padding_top),
    _padding_left(padding_left),
    _padding_bottom(padding_bottom),
    _padding_right(padding_right),
    _activation(activation),
    // Strides remain zero until set_input / set_output are called.
    _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
    _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0)
{
}
140 
MEMBERFN(void)141 MEMBERFN(void)::set_input(const void* const inptr)
142 {
143   set_input(inptr, _n_channels);
144 }
145 
MEMBERFN(void)146 MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
147 {
148   set_input(inptr, _n_input_cols * ld_col, ld_col);
149 }
150 
MEMBERFN(void)151 MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
152 {
153   set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);
154 }
155 
MEMBERFN(void)156 MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
157 {
158   _input = static_cast<const TIn *>(inptr);
159   _input_batch_stride = ld_batch;
160   _input_row_stride = ld_row;
161   _input_col_stride = ld_col;
162 }
163 
MEMBERFN(void)164 MEMBERFN(void)::set_output(void* const outptr)
165 {
166   set_output(outptr, _n_channels);
167 }
168 
MEMBERFN(void)169 MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
170 {
171   set_output(outptr, _n_output_cols * ld_col, ld_col);
172 }
173 
MEMBERFN(void)174 MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
175 {
176   set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);
177 }
178 
MEMBERFN(void)179 MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
180 {
181   _output = static_cast<TOut *>(outptr);
182   _output_batch_stride = ld_batch;
183   _output_row_stride = ld_row;
184   _output_col_stride = ld_col;
185 }
186 
// Size (in bytes) of the buffer needed to hold the packed weights and
// biases for all channels: per channel, one weight per kernel element plus
// one bias.
//
// NOTE(review): this budgets sizeof(TBias) per bias, whereas the default
// _pack_params instantiates PackParameters with sizeof(TOut) as the bias
// size - these agree only when TBias and TOut have equal size; confirm for
// instantiations where they differ.
MEMBERFN(size_t)::get_packed_params_size(void) const
{
  return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
}
191 
// Record the buffer which holds (or will hold) the packed weights and
// biases; it must provide at least get_packed_params_size() bytes.
MEMBERFN(void)::set_packed_params_buffer(void *buffer)
{
  _packed_parameters = buffer;
}
196 
MEMBERFN(void)197 MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
198 {
199   static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);
200 }
201 
MEMBERFN(void)202 MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
203 {
204   const unsigned int weight_col_stride = _n_channels;
205   const unsigned int weight_row_stride = KernelColumns * weight_col_stride;
206   static_cast<const Derived *>(this)->pack_params(
207     buffer, weights, weight_row_stride, weight_col_stride, biases
208   );
209 }
210 
MEMBERFN(void)211 MEMBERFN(void)::pack_params(
212   void * const buffer,
213   const void * const weights,
214   const unsigned int weight_row_stride,
215   const unsigned int weight_col_stride,
216   const void * const biases
217 ) const
218 {
219   static_cast<const Derived *>(this)->_pack_params(
220     buffer, weights, weight_row_stride, weight_col_stride, biases
221   );
222 }
223 
// Default packing implementation: delegates to the PackParameters helper
// specialised on the kernel size and element sizes.  Derived classes may
// shadow this (it is reached via the Derived pointer in pack_params).
MEMBERFN(void)::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  // Default implementation
  // NOTE(review): the bias element size passed here is sizeof(TOut), but
  // get_packed_params_size budgets sizeof(TBias) per bias - confirm these
  // agree for every instantiation where this default is used.
  PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute(
    _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
  );
}
237 
MEMBERFN(size_t)238 MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
239 {
240   return nthreads * (
241     _get_input_working_space_size() + _get_output_working_space_size()
242   );
243 }
244 
// Record the scratch buffer used for padded input reads and discarded
// output writes; it must provide at least get_working_space_size(nthreads)
// bytes for the number of threads that will call run.
MEMBERFN(void)::set_working_space(void *buffer)
{
  _working_space = buffer;
}
249 
MEMBERFN(size_t)250 MEMBERFN(size_t)::_get_input_working_space_size(void) const
251 {
252   return sizeof(TIn) * _n_channels;
253 }
254 
MEMBERFN(size_t)255 MEMBERFN(size_t)::_get_output_working_space_size(void) const
256 {
257   return sizeof(TOut) * _n_channels;
258 }
259 
MEMBERFN(void *)260 MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
261 {
262   return static_cast<uint8_t*>(_working_space) + threadid * (
263     _get_input_working_space_size() + _get_output_working_space_size()
264   );
265 }
266 
MEMBERFN(void *)267 MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
268 {
269   return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();
270 }
271 
// Amount of parallelisable work: one unit of work per CHANNEL_BLOCK-sized
// block of channels (run consumes a [start, stop) range of these units).
MEMBERFN(unsigned int)::get_window() const
{
  // Parallelise over blocks of channels.
  return iceildiv(_n_channels, CHANNEL_BLOCK);
}
277 
MEMBERFN(void)278 MEMBERFN(void)::run(
279   const unsigned int start,
280   const unsigned int stop,
281   const unsigned int threadid
282 )
283 {
284   // Clear the input padding buffer
285   TIn *buf = static_cast<TIn *>(_get_input_working_space(threadid));
286   const TIn pad_value = static_cast<Derived *>(this)->_input_padding_value();
287   for (int n = 0; n < _n_channels; n++)
288   {
289     buf[n] = pad_value;
290   }
291 
292   // Parallelise over blocks of channels
293   const auto start_channel = CHANNEL_BLOCK * start;
294   const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
295   const auto params_size_per_channel = this->get_packed_params_size()/_n_channels;
296 
297   // Compute top and bottom padding for input and output
298   const int input_pad_top = _padding_top;
299   const int input_pad_left = _padding_left;
300   constexpr int tile_overlap = kernel_rows - stride_rows;
301 
302   // Perform the convolution by calling `process_tile_row` for each tile row in
303   // each batch.
304   for (int batch = 0; batch < _n_batches; batch++)
305   {
306     const TIn* const inptr_batch = _input + batch*_input_batch_stride;
307     TOut* const outptr_batch = _output + batch*_output_batch_stride;
308 
309     // Loop over rows of tiles
310     for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
311     {
312       // Pointer to the row
313       const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
314       const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
315       TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
316 
317       // Input padding (top + bottom) for the row
318       const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
319       const int input_row_bottom = input_row_top + inner_tile_rows;
320       const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
321       const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
322 
323       // Output padding (bottom) for the row
324       const int output_row_bottom = (tile_i + 1)*output_tile_rows;
325       const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
326 
327       // Get the offset into the packed parameters
328       const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
329         start_channel*params_size_per_channel;
330 
331       // Process the row
332       process_tile_row(
333         threadid,
334         stop_channel - start_channel,
335         params_ptr,
336         inptr_row + start_channel,
337         outptr_row + start_channel,
338         input_row_pad_top, input_pad_left, input_row_pad_bottom,
339         output_row_pad_bottom,
340         _n_tile_cols, _n_input_cols, _n_output_cols
341       );
342     }
343   }
344 }
345 
// Process one row of output tiles for a block of channels: loop over the
// tile columns, compute each tile's left/right input padding and right
// output padding, then delegate each tile to process_tile.
MEMBERFN(void)::process_tile_row(
  const unsigned int threadid,
  const int n_channels,             // channels in this block
  const void* const packed_params,  // packed weights/biases for the block
  const TIn* const inptr,           // first input element read by this tile row
  TOut* const outptr,               // first output element written by this tile row
  const int row_pad_in_top,
  const int row_pad_in_left,
  const int row_pad_in_bottom,
  const int row_pad_out_bottom,
  const int n_tiles,
  const int n_input_cols,
  const int n_output_cols
)
{
  // Columns shared between horizontally adjacent inner tiles.
  constexpr int tile_overlap = kernel_cols - stride_cols;

  // Loop over columns of tiles
  for (int tile_j = 0; tile_j < n_tiles; tile_j++)
  {
    // Input padding (left + right) for the tile; only the first tile
    // carries the row's left padding.
    const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
    const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
    const int t_in_end = t_in_start + inner_tile_cols;
    const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);

    // Output padding (right) for the tile
    const int t_out_end = (tile_j + 1) * output_tile_cols;
    const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);

    // Get pointers into the inputs and outputs; the first tile starts at
    // the (implicitly padded) left edge of the input.
    const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
    TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;

    // Process just this tile
    process_tile(
      threadid, n_channels, packed_params, inptr_col, outptr_col,
      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,  // Input paddings
      row_pad_out_bottom, t_pad_out_right  // Output paddings
    );
  }
}
389 
// Value written into the input padding buffer by run.  It is called
// through the Derived pointer there, so derived classes may shadow it with
// a different padding value.
MEMBERFN(TIn)::_input_padding_value(void) const
{
  return static_cast<TIn>(0);
}
394 
// Compute a single output tile for a block of channels.  When the tile
// needs no padding, the derived class's strided execute_tile overload is
// called directly on the tensors.  Otherwise, per-element pointer arrays
// are built - padded input elements read this thread's padding buffer and
// padded output elements write to this thread's discard buffer - and the
// pointer-array execute_tile overload is used.  In both paths the
// activation function is dispatched here so the kernel can specialise on
// it at compile time.
MEMBERFN(void)::process_tile(
  const unsigned int threadid,
  const int n_channels,
  const void* const packed_params,
  const TIn* const inptr,
  TOut* const outptr,
  const int pad_in_top,
  const int pad_in_left,
  const int pad_in_bottom,
  const int pad_in_right,
  const int pad_out_bottom,
  const int pad_out_right
)
{
  Derived * dthis = static_cast<Derived *>(this);
  const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
  const bool pad_output = pad_out_bottom || pad_out_right;

  if (!pad_input && !pad_output)
  {
    // Fast path: no padding anywhere, operate on the tensors directly.
    switch(_activation)
    {
      case ActivationFunction::ReLU:
        dthis->template execute_tile<ActivationFunction::ReLU>(
          n_channels, packed_params,
          inptr, _input_row_stride, _input_col_stride,
          outptr, _output_row_stride, _output_col_stride
        );
        break;
      case ActivationFunction::ReLU6:
        dthis->template execute_tile<ActivationFunction::ReLU6>(
          n_channels, packed_params,
          inptr, _input_row_stride, _input_col_stride,
          outptr, _output_row_stride, _output_col_stride
        );
        break;
      default:
        dthis->template execute_tile<ActivationFunction::None>(
          n_channels, packed_params,
          inptr, _input_row_stride, _input_col_stride,
          outptr, _output_row_stride, _output_col_stride
        );
        break;
    }
  }
  else
  {
    // Create arrays of input and output pointers, pointing padded elements to
    // the working space padding buffers provided.
    const TIn *inptrs[inner_tile_rows][inner_tile_cols];
    for (int i = 0; i < inner_tile_rows; i++)
    {
      for (int j = 0; j < inner_tile_cols; j++)
      {
        if (i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i ||
            j < pad_in_left || (inner_tile_cols - pad_in_right) <= j)
        {
          // Padded input: read the padding buffer filled by run.
          inptrs[i][j] = static_cast<const TIn *>(_get_input_working_space(threadid));
        }
        else
        {
          inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride;
        }
      }
    }

    TOut *outptrs[output_tile_rows][output_tile_cols];
    for (int i = 0; i < output_tile_rows; i++)
    {
      for (int j = 0; j < output_tile_cols; j++)
      {
        if (i < (output_tile_rows - pad_out_bottom) &&
            j < (output_tile_cols - pad_out_right))
        {
          outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride;
        }
        else
        {
          // Padded output: redirect the write to this thread's scratch
          // buffer, where it is discarded.
          outptrs[i][j] = static_cast<TOut *>(_get_output_working_space(threadid));
        }
      }
    }

    switch(_activation)
    {
      case ActivationFunction::ReLU:
        dthis->template execute_tile<ActivationFunction::ReLU>(
          n_channels, packed_params, inptrs, outptrs
        );
        break;
      case ActivationFunction::ReLU6:
        dthis->template execute_tile<ActivationFunction::ReLU6>(
          n_channels, packed_params, inptrs, outptrs
        );
        break;
      default:
        dthis->template execute_tile<ActivationFunction::None>(
          n_channels, packed_params, inptrs, outptrs
        );
        break;
    }
  }
}
499 
// Number of channels processed at each spatial position.
MEMBERFN(int)::n_channels(void) const
{
  return _n_channels;
}
504 
505 }  // namespace depthwise
506