• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <arm_neon.h>
#include "activation.hpp"
#include "padding.hpp"

namespace depthwise
{

namespace nck = neon_convolution_kernels;

36 class IDepthwiseConvolution
37 {
38   public:
39     virtual ~IDepthwiseConvolution() = default;
40 
41     virtual int output_size(
42       int dim_size,
43       unsigned int padding_before,
44       unsigned int padding_after
45     ) const = 0;
46 
47     /* Set input tensor and stride. */
48     virtual void set_input(const void *inptr) = 0;
49     virtual void set_input(const void *inptr, int column_stride) = 0;
50     virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
51     virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
52 
53     /* Set output tensor and stride. */
54     virtual void set_output(void *outptr) = 0;
55     virtual void set_output(void *outptr, int column_stride) = 0;
56     virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
57     virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
58 
59     /* Weights and biases are re-ordered to improve memory access patterns. Use
60      * these methods to determine the size of the re-pack buffer and to set the
61      * address (and implicitly reorder the weights and biases into) the buffer.
62      */
63     virtual size_t get_packed_params_size(void) const = 0;
64     virtual void set_packed_params_buffer(void *) = 0;
65 
66     virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
67     virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
68     virtual void pack_params(
69       void *buffer,
70       const void* weights,
71       unsigned int weight_row_stride,
72       unsigned int weight_col_stride,
73       const void *biases=nullptr
74     ) const = 0;
75 
76     /* Working space is used to pad tensors on the fly. Before running any
77      * inference check the amount of space required, allocate and provide a
78      * pointer to the convolution engine.
79      */
80     virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
81     virtual void set_working_space(void *) = 0;
82 
83     virtual unsigned int get_window(void) const = 0;
84     virtual void run(
85       unsigned int start,
86       unsigned int stop,
87       unsigned int threadid=0
88     ) = 0;
89 };
90 
91 template <
92   unsigned int OutputTileRows, unsigned int OutputTileCols,
93   unsigned int KernelRows, unsigned int KernelCols,
94   unsigned int StrideRows, unsigned int StrideCols,
95   typename TIn, typename TBias, typename TOut,
96   typename Derived
97 >
98 class DepthwiseConvolutionBase : public IDepthwiseConvolution
99 {
100   public:
101     // Information about the specific convolution instance
102     using InputType = TIn;
103     using BiasType = TBias;
104     using OutputType = TOut;
105     static constexpr int output_tile_rows = OutputTileRows;
106     static constexpr int output_tile_cols = OutputTileCols;
107     static constexpr int kernel_rows = KernelRows;
108     static constexpr int kernel_cols = KernelCols;
109     static constexpr int stride_rows = StrideRows;
110     static constexpr int stride_cols = StrideCols;
111     static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
112     static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;
113 
114     /** Create a new depthwise convolution engine.
115      *
116      * @param[in] n_batches Number of batches tensors.
117      * @param[in] n_input_rows Number of rows in input tensor.
118      * @param[in] n_input_cols Number of columns in input tensor.
119      * @param[in] n_channels Number of channels in input and output tensors.
120      */
121     DepthwiseConvolutionBase(
122       int n_batches, int n_input_rows, int n_input_cols, int n_channels,
123       nck::ActivationFunction activation,
124       unsigned int padding_top,
125       unsigned int padding_left,
126       unsigned int padding_bottom,
127       unsigned int padding_right
128     );
129 
130     /** Create a new depthwise convolution engine.
131      *
132      * @param[in] n_batches Number of batches tensors.
133      * @param[in] n_input_rows Number of rows in input tensor.
134      * @param[in] n_input_cols Number of columns in input tensor.
135      * @param[in] n_channels Number of channels in input and output tensors.
136      */
137     DepthwiseConvolutionBase(
138       int n_batches, int n_input_rows, int n_input_cols, int n_channels,
139       int n_output_rows, int n_output_cols,
140       nck::ActivationFunction activation,
141       unsigned int padding_top,
142       unsigned int padding_left,
143       unsigned int padding_bottom,
144       unsigned int padding_right
145     );
146 
147     // Cannot copy or move a DepthwiseConvolution.
148     DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
149     DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
150 
151     /* Set input tensor and stride. */
152     void set_input(const void *inptr) override;
153     void set_input(const void *inptr, int column_stride) override;
154     void set_input(const void *inptr, int row_stride, int column_stride) override;
155     void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
156 
157     /* Set output tensor and stride. */
158     void set_output(void *outptr) override;
159     void set_output(void *outptr, int column_stride) override;
160     void set_output(void *outptr, int row_stride, int column_stride) override;
161     void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
162 
163     /** Get the number of output rows/columns.
164      *
165      * @param[in] dim_size Number of elements in the dimension (rows/columns)
166      * @param[in] same_padding True if the padding is SAME, otherwise false.
167      */
168     static int get_output_size(
169       int dim_size, unsigned int padding_before, unsigned int padding_after
170     );
171 
172     int output_size(
173       int dim_size, unsigned int padding_before, unsigned int padding_after
174     ) const override;
175 
176     /* Determine how much memory is required to store the packed weights and
177      * biases.
178      */
179     size_t get_packed_params_size(void) const override;
180 
181     /* Set the buffer for the packed weights and biases, and perform the
182      * packing.
183      */
184     void set_packed_params_buffer(void *buffer) override;
185 
186     void pack_params(const void *weights, const void *biases=nullptr) const override;
187 
188     void pack_params(
189       void *buffer,
190       const void *weights,
191       const void *biases=nullptr
192     ) const override;
193 
194     void pack_params(
195       void *buffer,
196       const void *weights,
197       unsigned int weight_row_stride,
198       unsigned int weight_col_stride,
199       const void *biases=nullptr
200     ) const override;
201 
202     /** Query the amount of working space required.
203      * @param[in] The largest number of threads which will be used to execute
204      *            the kernel.
205      */
206     size_t get_working_space_size(unsigned int n_threads=1) const override;
207 
208     /** Set the working space buffer.
209      */
210     void set_working_space(void *buffer) override;
211 
212     /** Get the window of work to be performed by an instance of the operator.
213      */
214     unsigned int get_window(void) const override;
215 
216     /** Perform a portion of the work associated with the operator.
217      *
218      * Will perform the window of work described by $[start, stop)$.
219      *
220      * @param[in] start Start of the window of work to perform.
221      * @param[in] stop End of the work to perform.
222      * @param[in] ID of the thread performing the work.
223      */
224     void run(
225       unsigned int start,
226       unsigned int stop,
227       unsigned int threadid=0
228     ) override;
229 
230   protected:
231     /** Get the value to use to pad the tensor.
232      */
233     TIn _input_padding_value(void) const;
234 
235     /** Implementation of the parameter packing.
236      */
237     void _pack_params(
238       void *buffer,
239       const void *weights,
240       unsigned int weight_row_stride,
241       unsigned int weight_col_stride,
242       const void *biases=nullptr
243     ) const;
244 
245     /** Process a tile-row of the tensors.
246      */
247     void process_tile_row(
248       unsigned int threadid,
249       int n_channels,
250       const void* packed_params,
251       const InputType* inptr,
252       OutputType* outptr,
253       int row_pad_in_top,
254       int row_pad_in_left,
255       int row_pad_in_bottom,
256       int row_pad_out_bottom,
257       int n_tiles,
258       int n_input_cols,
259       int n_output_cols
260     );
261 
262     /** Process a single tile of the tensor.
263      *
264      * This method will apply input/output padding (if required) and call the
265      * depthwise tile implementation.
266      */
267     void process_tile(
268       unsigned int threadid,
269       int n_channels,
270       const void* packed_params,
271       const InputType* inptr,
272       OutputType* outptr,
273       int pad_in_top,
274       int pad_in_left,
275       int pad_in_bottom,
276       int pad_in_right,
277       int pad_out_bottom,
278       int pad_out_right
279     );
280 
281     /** Perform depthwise convolution on a single tile.
282      */
283     template <nck::ActivationFunction Activation>
284     void execute_tile(
285       int n_channels,
286       const void* packed_params,
287       const InputType* inptr,
288       unsigned int in_row_stride,
289       unsigned int in_col_stride,
290       OutputType* outptr,
291       unsigned int out_row_stride,
292       unsigned int out_col_stride
293     );
294 
295     template <nck::ActivationFunction Activation>
296     void execute_tile(
297       int n_channels,
298       const void* packed_params,
299       const InputType* inptrs[inner_tile_rows][inner_tile_cols],
300       OutputType* outptrs[output_tile_rows][output_tile_cols]
301     );
302 
303     int n_channels(void) const;
304 
305   private:
306     // Member variables of instances of a convolution engine.
307     const InputType* _input;
308     OutputType* _output;
309     void* _packed_parameters;
310     void* _working_space;  // Per-thread working space
311     const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
312               _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
313     const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
314     const nck::ActivationFunction _activation;
315 
316     // Stride information for a convolution instance
317     int _input_col_stride, _input_row_stride, _input_batch_stride;
318     int _output_col_stride, _output_row_stride, _output_batch_stride;
319 
320     // Methods for getting access to working space
321     size_t _get_input_working_space_size(void) const;
322     size_t _get_output_working_space_size(void) const;
323 
324     void *_get_input_working_space(unsigned int threadid) const;
325     void *_get_output_working_space(unsigned int threadid) const;
326 };
327 
328 
329 template <
330   unsigned int OutputTileRows, unsigned int OutputTileCols,
331   unsigned int KernelRows, unsigned int KernelCols,
332   unsigned int StrideRows, unsigned int StrideCols,
333   typename TIn, typename TBias, typename TOut
334 >
335 class DepthwiseConvolution : public DepthwiseConvolutionBase<
336   OutputTileRows, OutputTileCols,
337   KernelRows, KernelCols,
338   StrideRows, StrideCols,
339   TIn, TBias, TOut,
340   DepthwiseConvolution<
341     OutputTileRows, OutputTileCols,
342     KernelRows, KernelCols,
343     StrideRows, StrideCols,
344     TIn, TBias, TOut
345   >
346 >
347 {
348   using Base = DepthwiseConvolutionBase<
349     OutputTileRows, OutputTileCols,
350     KernelRows, KernelCols,
351     StrideRows, StrideCols,
352     TIn, TBias, TOut,
353     DepthwiseConvolution<
354       OutputTileRows, OutputTileCols,
355       KernelRows, KernelCols,
356       StrideRows, StrideCols,
357       TIn, TBias, TOut
358   > >;
359   friend Base;
360   using InputType = typename Base::InputType;
361   using OutputType = typename Base::OutputType;
362 
363   public:
364     using Base::DepthwiseConvolutionBase;
365 
366   protected:
367     template <nck::ActivationFunction Activation>
368     void execute_tile(
369       int n_channels,
370       const void* packed_params,
371       const TIn* inptr,
372       unsigned int in_row_stride,
373       unsigned int in_col_stride,
374       TOut* outptr,
375       unsigned int out_row_stride,
376       unsigned int out_col_stride
377     );
378 
379     template <nck::ActivationFunction Activation>
380     void execute_tile(
381       int n_channels,
382       const void* packed_params,
383       const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
384       OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
385     );
386 };
387 
388 
389 template <
390   unsigned int OutputTileRows, unsigned int OutputTileCols,
391   unsigned int KernelRows, unsigned int KernelCols,
392   unsigned int StrideRows, unsigned int StrideCols
393 >
394 class DepthwiseConvolution<
395   OutputTileRows, OutputTileCols,
396   KernelRows, KernelCols,
397   StrideRows, StrideCols,
398   float, float, float
399 > : public DepthwiseConvolutionBase<
400   OutputTileRows, OutputTileCols,
401   KernelRows, KernelCols,
402   StrideRows, StrideCols,
403   float, float, float,
404   DepthwiseConvolution<
405     OutputTileRows, OutputTileCols,
406     KernelRows, KernelCols,
407     StrideRows, StrideCols,
408     float, float, float
409   >
410 >
411 {
412   using Base = DepthwiseConvolutionBase<
413     OutputTileRows, OutputTileCols,
414     KernelRows, KernelCols,
415     StrideRows, StrideCols,
416     float, float, float,
417     DepthwiseConvolution<
418       OutputTileRows, OutputTileCols,
419       KernelRows, KernelCols,
420       StrideRows, StrideCols,
421       float, float, float
422   > >;
423   friend Base;
424   using InputType = typename Base::InputType;
425   using OutputType = typename Base::OutputType;
426 
427   public:
428     DepthwiseConvolution(
429       int n_batches, int n_input_rows, int n_input_cols, int n_channels,
430       nck::ActivationFunction activation,
431       unsigned int padding_top,
432       unsigned int padding_left,
433       unsigned int padding_bottom,
434       unsigned int padding_right
435     );
436 
437     DepthwiseConvolution(
438       int n_batches, int n_input_rows, int n_input_cols, int n_channels,
439       int n_output_rows, int n_output_cols,
440       nck::ActivationFunction activation,
441       unsigned int padding_top,
442       unsigned int padding_left,
443       unsigned int padding_bottom,
444       unsigned int padding_right
445     );
446 
447   protected:
448     template <nck::ActivationFunction Activation>
449     void execute_tile(
450       int n_channels,
451       const void* packed_params,
452       const float* inptr,
453       unsigned int in_row_stride,
454       unsigned int in_col_stride,
455       float* outptr,
456       unsigned int out_row_stride,
457       unsigned int out_col_stride
458     );
459 
460     template <nck::ActivationFunction Activation>
461     void execute_tile(
462       int n_channels,
463       const void* packed_params,
464       const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
465       float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
466     );
467 };
468 
469 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
470 template <
471   unsigned int OutputTileRows, unsigned int OutputTileCols,
472   unsigned int KernelRows, unsigned int KernelCols,
473   unsigned int StrideRows, unsigned int StrideCols
474 >
475 class DepthwiseConvolution<
476   OutputTileRows, OutputTileCols,
477   KernelRows, KernelCols,
478   StrideRows, StrideCols,
479   float16_t, float16_t, float16_t
480 > : public DepthwiseConvolutionBase<
481   OutputTileRows, OutputTileCols,
482   KernelRows, KernelCols,
483   StrideRows, StrideCols,
484   float16_t, float16_t, float16_t,
485   DepthwiseConvolution<
486     OutputTileRows, OutputTileCols,
487     KernelRows, KernelCols,
488     StrideRows, StrideCols,
489     float16_t, float16_t, float16_t
490   >
491 >
492 {
493   using Base = DepthwiseConvolutionBase<
494     OutputTileRows, OutputTileCols,
495     KernelRows, KernelCols,
496     StrideRows, StrideCols,
497     float16_t, float16_t, float16_t,
498     DepthwiseConvolution<
499       OutputTileRows, OutputTileCols,
500       KernelRows, KernelCols,
501       StrideRows, StrideCols,
502       float16_t, float16_t, float16_t
503   > >;
504   friend Base;
505   using InputType = typename Base::InputType;
506   using OutputType = typename Base::OutputType;
507 
508   public:
509     DepthwiseConvolution(
510       int n_batches, int n_input_rows, int n_input_cols, int n_channels,
511       nck::ActivationFunction activation,
512       unsigned int padding_top,
513       unsigned int padding_left,
514       unsigned int padding_bottom,
515       unsigned int padding_right
516     );
517 
518     DepthwiseConvolution(
519       int n_batches, int n_input_rows, int n_input_cols, int n_channels,
520       int n_output_rows, int n_output_cols,
521       nck::ActivationFunction activation,
522       unsigned int padding_top,
523       unsigned int padding_left,
524       unsigned int padding_bottom,
525       unsigned int padding_right
526     );
527 
528   protected:
529     template <nck::ActivationFunction Activation>
530     void execute_tile(
531       int n_channels,
532       const void* packed_params,
533       const float16_t* inptr,
534       unsigned int in_row_stride,
535       unsigned int in_col_stride,
536       float16_t* outptr,
537       unsigned int out_row_stride,
538       unsigned int out_col_stride
539     );
540 
541     template <nck::ActivationFunction Activation>
542     void execute_tile(
543       int n_channels,
544       const void* packed_params,
545       const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
546       float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
547     );
548 };
549 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
550 
}  // namespace depthwise