/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
#define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H

#include "src/core/NEON/INEKernel.h"
#include "src/core/NEON/kernels/convolution/common/convolution.hpp"
#include "src/core/NEON/kernels/convolution/common/tensor.hpp"

#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"

namespace arm_compute
{
// Forward declarations
class ITensor;

/** Interface for the NEON kernel to perform Winograd input transform. */
class INEWinogradLayerTransformInputKernel : public INEKernel
{
public:
    /** Get the working space required to perform the transformation.
     *
     * Note: the working space is only required when performing the
     * transformation - hence it can be reused whenever the transformation is
     * not running.
     *
     * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
     *
     * @return Size of working space required in bytes.
     */
    virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;

    /** Determine how much memory (in units of TIn) to allocate for the
     * transformed input.
     *
     * @param[in] num_batches  Number of batches in the input tensor.
     * @param[in] num_channels Number of feature maps in the input tensor.
     * @param[in] num_rows     Number of rows in each feature map.
     * @param[in] num_cols     Number of columns in each feature map.
     * @param[in] same_padding True to use "SAME" padding, false to use "VALID" padding.
     *
     * @return Storage size (in units of TIn) required.
     */
    virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;

    /** Gets the stride between matrices in the input workspace.
     *
     * @param[in] num_batches  Number of batches in the input tensor.
     * @param[in] num_channels Number of feature maps in the input tensor.
     * @param[in] num_rows     Number of rows in each feature map.
     * @param[in] num_cols     Number of columns in each feature map.
     * @param[in] same_padding True to use "SAME" padding, false to use "VALID" padding.
     *
     * @return Stride expressed in bytes.
     */
    virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;

    /** Configure the input transform kernel.
     *
     * @param[in]  input_nhwc    Input tensor in NHWC data layout format.
     * @param[in]  num_batches   Number of batches in input tensor.
     * @param[in]  num_rows      Number of rows in input tensor.
     * @param[in]  num_cols      Number of columns in input tensor.
     * @param[in]  num_channels  Number of channels in input tensor.
     * @param[in]  padding       Padding type.
     * @param[out] output        Base of output matrices.
     * @param[in]  matrix_stride Stride between output matrices.
     * @param[in]  workspace     Tensor to be used as the working space during the computation.
     */
    virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels,
                           const PaddingType padding, ITensor *output, const int matrix_stride, ITensor *workspace) = 0;

    /** Destructor */
    virtual ~INEWinogradLayerTransformInputKernel()
    {
    }
};

/** NEON kernel to perform Winograd input transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel
{
public:
    /** Prevent instances of this class from being copied (As this class contains pointers) */
    NEWinogradLayerTransformInputKernel(const NEWinogradLayerTransformInputKernel &) = delete;
    /** Prevent instances of this class from being copied (As this class contains pointers) */
    NEWinogradLayerTransformInputKernel &operator=(const NEWinogradLayerTransformInputKernel &) = delete;
    /** Allow instances of this class to be moved */
    NEWinogradLayerTransformInputKernel(NEWinogradLayerTransformInputKernel &&) = default;
    /** Allow instances of this class to be moved */
    NEWinogradLayerTransformInputKernel &operator=(NEWinogradLayerTransformInputKernel &&) = default;
    /** Default destructor */
    ~NEWinogradLayerTransformInputKernel() = default;

    /** Determine how much memory (in units of TIn) to allocate for the
     * transformed input.
     *
     * @param[in] num_batches  Number of batches in the input tensor.
     * @param[in] num_channels Number of feature maps in the input tensor.
     * @param[in] num_rows     Number of rows in each feature map.
     * @param[in] num_cols     Number of columns in each feature map.
     * @param[in] same_padding True to use "SAME" padding, false to use "VALID" padding.
     *
     * @return Storage size (in units of TIn) required.
     */
    unsigned int get_input_storage_size(
        int  num_batches,
        int  num_channels,
        int  num_rows,
        int  num_cols,
        bool same_padding) const override;

    /** Get the working space required to perform the transformation.
     *
     * Note: the working space is only required when performing the
     * transformation - hence it can be reused whenever the transformation is
     * not running.
     *
     * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
     *
     * @return Size of working space required in bytes.
     */
    unsigned int get_working_space_size(unsigned int num_threads) const override;

    /** Gets the stride between matrices in the input workspace.
     *
     * @param[in] num_batches  Number of batches in the input tensor.
     * @param[in] num_channels Number of feature maps in the input tensor.
     * @param[in] num_rows     Number of rows in each feature map.
     * @param[in] num_cols     Number of columns in each feature map.
     * @param[in] same_padding True to use "SAME" padding, false to use "VALID" padding.
     *
     * @return Stride expressed in bytes.
     */
    int get_matrix_stride(
        int  num_batches,
        int  num_channels,
        int  num_rows,
        int  num_cols,
        bool same_padding) const override;

    /** Default constructor */
    NEWinogradLayerTransformInputKernel();

    const char *name() const override
    {
        return "NEWinogradLayerTransformInputKernel";
    }

    /** Configure the input transform kernel.
     *
     * @param[in]  input_nhwc    Input tensor. Data types supported: F16/F32. Layout supported: NHWC.
     * @param[in]  num_batches   Number of batches in input tensor.
     * @param[in]  num_rows      Number of rows in input tensor.
     * @param[in]  num_cols      Number of columns in input tensor.
     * @param[in]  num_channels  Number of channels in input tensor.
     * @param[in]  padding       Padding type.
     * @param[out] output        Base of output matrices.
     * @param[in]  matrix_stride Stride between output matrices.
     * @param[in]  workspace     Tensor to be used as the working space during the computation.
     */
    void configure(
        const ITensor    *input_nhwc,
        const int         num_batches,
        const int         num_rows,
        const int         num_cols,
        const int         num_channels,
        const PaddingType padding,
        ITensor          *output,
        const int         matrix_stride,
        ITensor          *workspace) override;

    // Inherited methods overridden:
    void run(const Window &window, const ThreadInfo &info) override;

    /** Winograd base kernel */
    using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
    /** Winograd convolution kernel */
    using WinogradConv = typename WinogradBase::template Convolution<T, T>;

    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformInputKernel
     *
     * @param[in] input         First tensor input info. Data types supported: F16/F32.
     * @param[in] output        Output tensor info. Data types supported: same as @p input.
     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);

private:
    using InputTransform = typename WinogradBase::template InputTransform<T, T>;

    std::unique_ptr<InputTransform> _transform{ nullptr };
    const ITensor                  *_input_nhwc;     /**< Input tensor in NHWC data layout. */
    int                             _num_batches;    /**< Number of batches in input tensor. */
    int                             _num_rows;       /**< Number of rows in input tensor. */
    int                             _num_cols;       /**< Number of columns in input tensor. */
    int                             _num_channels;   /**< Number of channels in input tensor. */
    PaddingType                     _padding;        /**< Padding type. */
    ITensor                        *_output;         /**< Base of output matrices. */
    int                             _matrix_stride;  /**< Stride between output matrices. */
    int                             _padding_top;    /**< Padding to apply to the top of the image. */
    int                             _padding_left;   /**< Padding to apply to the left of the image. */
    int                             _padding_right;  /**< Padding to apply to the right of the image. */
    int                             _padding_bottom; /**< Padding to apply to the bottom of the image. */
    ITensor                        *_workspace;      /**< Tensor used as working space during the computation. */
};
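
/* Usage sketch (illustrative, not part of the library API): a plausible call sequence
 * for the input transform kernel. The tensors "input_nhwc", "transformed_input" and
 * "workspace" and the scalar parameters are hypothetical and assumed to be created and
 * allocated by the caller; PADDING_SAME / PADDING_VALID are assumed to be the PaddingType
 * enumerators provided by convolution.hpp. Sizes are queried first so that the
 * Winograd-domain buffer and the shared working space can be allocated before configure().
 *
 *     using InputKernel = NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>; // F(2x2, 3x3), F32
 *     InputKernel in_kernel;
 *     // Storage for the transformed input, in units of the input type (float here)
 *     const unsigned int in_storage = in_kernel.get_input_storage_size(num_batches, num_channels, num_rows, num_cols, use_same_padding);
 *     // Working space in bytes; only needed while the kernel runs, so it can be shared with other kernels
 *     const unsigned int ws_bytes = in_kernel.get_working_space_size(num_threads);
 *     const int          stride   = in_kernel.get_matrix_stride(num_batches, num_channels, num_rows, num_cols, use_same_padding);
 *     in_kernel.configure(&input_nhwc, num_batches, num_rows, num_cols, num_channels,
 *                         use_same_padding ? PADDING_SAME : PADDING_VALID,
 *                         &transformed_input, stride, &workspace);
 *     // The configured kernel is then scheduled/run like any other INEKernel.
 */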

/** Interface for the NEON kernel to perform Winograd output transform. */
class INEWinogradLayerTransformOutputKernel : public INEKernel
{
public:
    /** Get the working space required to perform the transformation.
     *
     * Note: the working space is only required when performing the
     * transformation - hence it can be reused whenever the transformation is
     * not running.
     *
     * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
     *
     * @return Size of working space required in bytes.
     */
    virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;

    /** Determine how much memory (in units of TOut) to allocate for the
     * (Winograd domain) output.
     *
     * @param[in] num_batches         Number of batches in the output tensor.
     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
     * @param[in] num_output_channels Number of feature maps in the output tensor.
     *
     * @return Storage size (in units of TOut) required.
     */
    virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;

    /** Gets the stride between matrices in the output workspace.
     *
     * @param[in] num_batches         Number of batches in the output tensor.
     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
     * @param[in] num_output_channels Number of feature maps in the output tensor.
     *
     * @return Stride expressed in bytes.
     */
    virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;

    /** Get the output shape of a convolution.
     *
     * @param[in] num_rows     Number of rows in each feature map of the input tensor.
     * @param[in] num_cols     Number of columns in each feature map of the input tensor.
     * @param[in] padding_same True if padding is SAME, false otherwise.
     *
     * @return Shape of the output tensor
     */
    virtual std::pair<unsigned int, unsigned int> get_output_shape(
        int  num_rows,    /* Number of rows in each feature map of the input tensor. */
        int  num_cols,    /* Number of columns in each feature map of the input tensor. */
        bool padding_same /* True if padding is SAME, false otherwise. */
    ) const = 0;

    /** Configure the output transform kernel.
     *
     * @param[in]  biases             Pointer to the biases tensor.
     * @param[in]  transformed_output Pointer to working space for the output tensor in the Winograd domain.
     * @param[in]  matrix_stride      Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
     * @param[out] output_nhwc        Pointer to the output tensor in NHWC data layout, in the spatial domain.
     * @param[in]  num_batches        Number of batches in the input tensor.
     * @param[in]  num_rows           Number of rows in output tensor.
     * @param[in]  num_cols           Number of columns in output tensor.
     * @param[in]  num_channels       Number of feature maps in the output tensor.
     * @param[in]  workspace          Tensor to be used as the working space during the computation.
     * @param[in]  activation         Activation to be used.
     */
    virtual void configure(
        const ITensor              *biases,
        const ITensor              *transformed_output,
        const int                   matrix_stride,
        ITensor                    *output_nhwc,
        const int                   num_batches,
        const int                   num_rows,
        const int                   num_cols,
        const int                   num_channels,
        ITensor                    *workspace,
        const arm_gemm::Activation &activation) = 0;

    /** Destructor */
    virtual ~INEWinogradLayerTransformOutputKernel()
    {
    }
};

/** NEON kernel to perform Winograd output transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel
{
public:
    const char *name() const override
    {
        return "NEWinogradLayerTransformOutputKernel";
    }
    /** Constructor */
    NEWinogradLayerTransformOutputKernel();

    /** Prevent instances of this class from being copied (As this class contains pointers) */
    NEWinogradLayerTransformOutputKernel(const NEWinogradLayerTransformOutputKernel &) = delete;
    /** Prevent instances of this class from being copied (As this class contains pointers) */
    NEWinogradLayerTransformOutputKernel &operator=(const NEWinogradLayerTransformOutputKernel &) = delete;
    /** Allow instances of this class to be moved */
    NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default;
    /** Allow instances of this class to be moved */
    NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default;
    /** Default destructor */
    ~NEWinogradLayerTransformOutputKernel() = default;

    // Inherited methods overridden:
    /** Determine how much memory (in units of TOut) to allocate for the
     * (Winograd domain) output.
     *
     * @param[in] num_batches         Number of batches in the output tensor.
     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
     * @param[in] num_output_channels Number of feature maps in the output tensor.
     *
     * @return Storage size (in units of TOut) required.
     */
    unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;

    /** Gets the stride between matrices in the output workspace.
     *
     * @param[in] num_batches         Number of batches in the output tensor.
     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
     * @param[in] num_output_channels Number of feature maps in the output tensor.
     *
     * @return Stride expressed in bytes.
     */
    int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;
    /** Get the output shape of a convolution.
     *
     * @param[in] num_rows     Number of rows in each feature map of the input tensor.
     * @param[in] num_cols     Number of columns in each feature map of the input tensor.
     * @param[in] padding_same True if padding is SAME, false otherwise.
     *
     * @return Shape of the output tensor
     */
    std::pair<unsigned int, unsigned int> get_output_shape(
        int  num_rows, /* Number of rows in each feature map of the input tensor. */
        int  num_cols, /* Number of columns in each feature map of the input tensor. */
        bool padding_same) const override;

    /** Get the working space required to perform the transformation.
     *
     * Note: the working space is only required when performing the
     * transformation - hence it can be reused whenever the transformation is
     * not running.
     *
     * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
     *
     * @return Size of working space required in bytes.
     */
    unsigned int get_working_space_size(unsigned int num_threads) const override;

    /** Configure the output transform kernel.
     *
     * @param[in]  biases             Pointer to the biases tensor.
     * @param[in]  transformed_output Pointer to working space for the output tensor in the Winograd domain.
     * @param[in]  matrix_stride      Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
     * @param[out] output_nhwc        Pointer to a tensor with NHWC data layout, in the spatial domain.
     * @param[in]  num_batches        Number of batches in the input tensor.
     * @param[in]  num_rows           Number of rows in output tensor.
     * @param[in]  num_cols           Number of columns in output tensor.
     * @param[in]  num_channels       Number of feature maps in the output tensor.
     * @param[in]  workspace          Tensor to be used as the working space during the computation.
     * @param[in]  activation         Activation to be used.
     */
    void configure(
        const ITensor              *biases,
        const ITensor              *transformed_output,
        const int                   matrix_stride,
        ITensor                    *output_nhwc,
        const int                   num_batches,
        const int                   num_rows,
        const int                   num_cols,
        const int                   num_channels,
        ITensor                    *workspace,
        const arm_gemm::Activation &activation) override;

    void run(const Window &window, const ThreadInfo &info) override;

    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformOutputKernel
     *
     * @param[in] input         Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32.
     * @param[in] bias          Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
     * @param[in] output        Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info);

private:
    using WinogradBase    = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
    using WinogradConv    = typename WinogradBase::template Convolution<T, T>;
    using OutputTransform = typename WinogradBase::template OutputTransform<T, T>;

    std::unique_ptr<OutputTransform> _transform{ nullptr };
    const ITensor                   *_biases;             /**< Biases tensor. */
    const ITensor                   *_transformed_output; /**< Output tensor in the Winograd domain. */
    ITensor                         *_workspace;          /**< Tensor used as working space during the computation. */
    int                              _matrix_stride;      /**< Stride between output matrices. */
    int                              _matrix_row_stride;  /**< Row stride of the output matrices. */
    ITensor                         *_output_nhwc;        /**< Output tensor in NHWC data layout, in the spatial domain. */
    int                              _num_batches;        /**< Number of batches in the input tensor. */
    int                              _num_rows;           /**< Number of rows in the output tensor. */
    int                              _num_cols;           /**< Number of columns in the output tensor. */
    int                              _num_channels;       /**< Number of feature maps in the output tensor. */
};
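
/* Usage sketch (illustrative, not part of the library API): a plausible call sequence
 * for the output transform kernel. The tensors "biases", "transformed_output",
 * "output_nhwc" and "workspace", and the scalar parameters (including the spatial
 * output size out_rows x out_cols, which get_output_shape() can provide), are
 * hypothetical; a default-constructed arm_gemm::Activation is assumed to mean
 * "no activation".
 *
 *     using OutputKernel = NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
 *     OutputKernel out_kernel;
 *     // Winograd-domain storage (units of the output type) and the stride between matrices
 *     const unsigned int out_storage = out_kernel.get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels);
 *     const int          stride      = out_kernel.get_matrix_stride(num_batches, num_rows, num_cols, num_output_channels);
 *     out_kernel.configure(&biases, &transformed_output, stride, &output_nhwc,
 *                          num_batches, out_rows, out_cols, num_output_channels,
 *                          &workspace, arm_gemm::Activation{});
 */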

/** Interface for the NEON kernel to perform Winograd weights transform. */
class INEWinogradLayerTransformWeightsKernel : public INEKernel
{
public:
    /** Allow instances of this class to be copied */
    INEWinogradLayerTransformWeightsKernel(const INEWinogradLayerTransformWeightsKernel &) = default;
    /** Allow instances of this class to be copied */
    INEWinogradLayerTransformWeightsKernel &operator=(const INEWinogradLayerTransformWeightsKernel &) = default;
    /** Allow instances of this class to be moved */
    INEWinogradLayerTransformWeightsKernel(INEWinogradLayerTransformWeightsKernel &&) = default;
    /** Allow instances of this class to be moved */
    INEWinogradLayerTransformWeightsKernel &operator=(INEWinogradLayerTransformWeightsKernel &&) = default;

    /** Default constructor */
    INEWinogradLayerTransformWeightsKernel()
    {
    }
    /** Destructor */
    virtual ~INEWinogradLayerTransformWeightsKernel()
    {
    }
    /** Determine how much memory (in units of T) to allocate for the
     * transformed weights.
     *
     * @param[in] num_output_channels Number of output feature maps.
     * @param[in] num_input_channels  Number of input feature maps.
     *
     * @return Storage size (in units of T) required.
     */
    virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0;
    /** Gets the stride between matrices in the kernel workspace.
     *
     * @param[in] num_output_channels Number of output feature maps.
     * @param[in] num_input_channels  Number of input feature maps.
     *
     * @return Stride expressed in bytes.
     */
    virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0;

    /** Configure the weights transform kernel.
     *
     * @param[in]  weights_hwio        Pointer to the weights tensor.
     * @param[out] output              Pointer to working space for the output tensor in the Winograd domain.
     * @param[in]  matrix_stride       Stride across matrices in the output workspace.
     * @param[in]  num_output_channels Number of filters.
     * @param[in]  num_input_channels  Number of channels in each filter.
     */
    virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0;

    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
     *
     * @param[in] input   First tensor input info. Data types supported: F16/F32.
     * @param[in] weights Weights tensor info. Data types supported: same as @p input.
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *weights);
};

/** NEON kernel to perform Winograd weights transform. */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel
{
public:
    /** Prevent instances of this class from being copied (As this class contains pointers) */
    NEWinogradLayerTransformWeightsKernel(const NEWinogradLayerTransformWeightsKernel &) = delete;
    /** Prevent instances of this class from being copied (As this class contains pointers) */
    NEWinogradLayerTransformWeightsKernel &operator=(const NEWinogradLayerTransformWeightsKernel &) = delete;
    /** Allow instances of this class to be moved */
    NEWinogradLayerTransformWeightsKernel(NEWinogradLayerTransformWeightsKernel &&) = default;
    /** Allow instances of this class to be moved */
    NEWinogradLayerTransformWeightsKernel &operator=(NEWinogradLayerTransformWeightsKernel &&) = default;
    /** Default destructor */
    ~NEWinogradLayerTransformWeightsKernel() = default;

    /** Default constructor. */
    NEWinogradLayerTransformWeightsKernel();
    const char *name() const override
    {
        return "NEWinogradLayerTransformWeightsKernel";
    }

    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
     *
     * @param[in] input         Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout).
     *                          kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32.
     * @param[in] output        Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input
     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);

    // Inherited methods overridden:

#ifndef DOXYGEN_SKIP_THIS
    /** Configure the weights transform kernel.
     *
     * @param[in]  weights_hwio        Pointer to the weights tensor.
     * @param[out] output              Pointer to working space for the output tensor in the Winograd domain.
     * @param[in]  matrix_stride       Stride across matrices in the output workspace.
     * @param[in]  num_output_channels Number of filters.
     * @param[in]  num_input_channels  Number of channels in each filter.
     */
    void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override;
#endif /* DOXYGEN_SKIP_THIS */

    /** Determine how much memory (in units of T) to allocate for the
     * transformed weights.
     *
     * @param[in] num_output_channels Number of output feature maps.
     * @param[in] num_input_channels  Number of input feature maps.
     *
     * @return Storage size (in units of T) required.
     */
    unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override;

    /** Gets the stride between matrices in the kernel workspace.
     *
     * @param[in] num_output_channels Number of output feature maps.
     * @param[in] num_input_channels  Number of input feature maps.
     *
     * @return Stride expressed in bytes.
     */
    int get_matrix_stride(int num_output_channels, int num_input_channels) const override;
    void run(const Window &window, const ThreadInfo &info) override;
    bool is_parallelisable() const override;

private:
    using WinogradBase     = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
    using WinogradConv     = typename WinogradBase::template Convolution<T, T>;
    using WeightsTransform = typename WinogradBase::template WeightsTransform<T, T>;

    std::unique_ptr<WeightsTransform> _transform{ nullptr };
    const ITensor                    *_weights_hwio;        /**< Weights tensor. */
    ITensor                          *_output;              /**< Transformed weights, in the Winograd domain. */
    int                               _matrix_stride;       /**< Stride across matrices in the output workspace. */
    int                               _num_output_channels; /**< Number of filters. */
    int                               _num_input_channels;  /**< Number of channels in each filter. */
};
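
/* Usage sketch (illustrative, not part of the library API): a plausible call sequence
 * for the weights transform kernel, with hypothetical tensors "weights_hwio" (weights in
 * HWIO order) and "transformed_weights" created and allocated by the caller. The weights
 * transform is typically run once ahead of time, since the transformed weights can be
 * reused across invocations of the convolution.
 *
 *     using WeightsKernel = NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
 *     WeightsKernel w_kernel;
 *     // Storage (in units of T) for the transformed weights, and the stride between matrices
 *     const unsigned int w_storage = w_kernel.get_weight_storage_size(num_output_channels, num_input_channels);
 *     const int          stride    = w_kernel.get_matrix_stride(num_output_channels, num_input_channels);
 *     w_kernel.configure(&weights_hwio, &transformed_weights, stride, num_output_channels, num_input_channels);
 */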

/** Helper to define the NEON transform kernels for a given Winograd configuration. */
template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
class NEWinogradLayerConfiguration
{
public:
    /** Winograd base kernel */
    using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
    /** Winograd convolution kernel */
    using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>;

    using TransformInputKernel   = NEWinogradLayerTransformInputKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
    using TransformWeightsKernel = NEWinogradLayerTransformWeightsKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
    using TransformOutputKernel  = NEWinogradLayerTransformOutputKernel<TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
};
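
/* Illustrative sketch: for the F(2x2, 3x3) Winograd variant on F32 data (3x3 kernels,
 * 2x2 output tiles), the three transform kernel types can be taken from this helper
 * instead of being spelled out individually:
 *
 *     using Cfg = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
 *     Cfg::TransformWeightsKernel weights_transform;
 *     Cfg::TransformInputKernel   input_transform;
 *     Cfg::TransformOutputKernel  output_transform;
 */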

} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H*/