/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "arm_compute/core/Error.h"

#include "src/core/NEON/kernels/assembly/winograd.hpp"

#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
#include <algorithm>
#include <cstring>
#include <functional>
namespace arm_conv {
namespace winograd {
namespace input_transform {

namespace {

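// Integer division rounding the quotient up (e.g. iceildiv(7u, 4u) == 2u);
// used below to count the number of tiles needed to cover the output tensor.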
template <typename T>
constexpr T iceildiv(const T a, const T b)
{
  return (a + b - 1) / b;
}

}

/* Driver class for the Winograd input transforms.
 *
 * This provides a base implementation which handles iteration over the input
 * tensor; subclasses are responsible for managing working space and executing
 * the transform on individual tiles.
 */
template <typename TIn, typename TOut=TIn>
class TransformBase : public ITransform
{
  const std::string m_name;
  const unsigned int m_input_rows, m_input_cols;

  protected:
  virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
  {
    return 0;
  }

  virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
  {
    // Nothing to do
  }

  virtual void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
    TOut *outptr, size_t ld_out_matrix,
    unsigned int pad_top, unsigned int valid_rows,
    unsigned int pad_left, unsigned int valid_cols,
    void *working_space
  ) const = 0;

  void execute_internal(
    const ConvolutionArgs &args,
    const TIn *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
    TOut *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const
  {
    // Get the working space for this thread, and initialise it.
    working_space = reinterpret_cast<char *>(working_space) +
                    this->get_working_space_per_thread(args) * thread_id;
    this->initialise_thread_working_space(args, working_space);

    // Get tile traversal parameters
    const auto tile_stride_rows = std::max(1u, m_input_rows - args.kernel_shape.rows + 1);
    const auto tile_stride_cols = std::max(1u, m_input_cols - args.kernel_shape.cols + 1);
    const auto n_tile_rows = iceildiv(
      args.output_shape.rows, m_input_rows - args.kernel_shape.rows + 1);
    const auto n_tile_cols = iceildiv(
      args.output_shape.cols, m_input_cols - args.kernel_shape.cols + 1);
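
    // Each tile produces (at most) tile_stride_rows x tile_stride_cols output
    // points, so n_tile_rows and n_tile_cols give the number of tiles required
    // to cover the output, rounded up.
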
    // Execute over all batches
    for (unsigned int batch = 0; batch < args.n_batches; batch++)
    {
      auto outptr_tile = outptr + thread_id * n_tile_cols * ld_out_row;

      // For a single batch, stripe the rows over the threads.
      for (auto tile_i = thread_id; tile_i < n_tile_rows; tile_i += n_threads)
      {
        // Compute pointers and padding for this row of tiles
        const auto start_i = tile_i * tile_stride_rows;
        const auto pad_top = start_i < args.pad_top ? args.pad_top - start_i : 0;
        const auto inptr_row = inptr + (pad_top ? 0 : start_i - args.pad_top) * ld_in_row;
        const auto valid_rows = args.input_shape.rows - (pad_top ? 0 : start_i - args.pad_top);

        // Iterate over columns
        for (auto tile_j = 0u; tile_j < n_tile_cols; tile_j++)
        {
          // Compute pointers and padding for this tile, then delegate to
          // execute the kernel.
          const auto start_j = tile_j * tile_stride_cols;
          const auto pad_left = start_j < args.pad_left ? args.pad_left - start_j : 0;
          const auto inptr_tile = inptr_row + (pad_left ? 0 : start_j - args.pad_left) * ld_in_col;
          const auto valid_cols = args.input_shape.cols - (pad_left ? 0 : start_j - args.pad_left);

          this->execute_tile(
            args.n_input_channels,
            inptr_tile, ld_in_row, ld_in_col,
            outptr_tile, ld_out_matrix,
            pad_top, valid_rows, pad_left, valid_cols,
            working_space
          );
          outptr_tile += ld_out_row;
        }

        outptr_tile += (n_threads - 1) * n_tile_cols * ld_out_row;
      }

      inptr += ld_in_batch;
      outptr += ld_out_batch;
    }
  }

  public:
  TransformBase(const std::string &name, unsigned int input_rows, unsigned int input_cols)
  : m_name(name), m_input_rows(input_rows), m_input_cols(input_cols)
  {
  }

  const std::string &get_name(void) const override { return m_name; }

  unsigned int get_input_rows(void) const override final { return m_input_rows; }
  unsigned int get_input_cols(void) const override final { return m_input_cols; }

  size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
  {
    return n_threads * this->get_working_space_per_thread(args);
  }

  void execute(
    const ConvolutionArgs &args,
    const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
    void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const override
  {
    execute_internal(
      args,
      reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_row, ld_in_col,
      reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_matrix, ld_out_row,
      working_space, thread_id, n_threads
    );
  }
};
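
/* Illustrative usage sketch (the pointer, stride and args names below are
 * placeholders, not part of this interface):
 *
 *   const auto ws_bytes = transform.get_working_space_size(args, n_threads);
 *   // Allocate ws_bytes bytes, then from each participating thread call:
 *   transform.execute(
 *     args,
 *     inptr, ld_in_batch, ld_in_row, ld_in_col,
 *     outptr, ld_out_batch, ld_out_matrix, ld_out_row,
 *     working_space, thread_id, n_threads
 *   );
 */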

template <typename TIn, typename TOut=TIn>
class TransformDirect : public TransformBase<TIn, TOut>
{
  using Kernel = std::function<void(
    unsigned int,  // Number of channels
    const TIn *, size_t, size_t,  // Pointer to first valid input element, row and column stride
    unsigned int, unsigned int, unsigned int, unsigned int,  // Top, left, bottom and right padding
    TOut *, size_t  // Base output pointer, stride between matrices
  )>;
  const Kernel m_kernel;

  protected:
  void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
    TOut *outptr, size_t ld_out_matrix,
    unsigned int pad_top, unsigned int valid_rows,
    unsigned int pad_left, unsigned int valid_cols,
    void *working_space
  ) const override
  {
    ARM_COMPUTE_UNUSED(working_space);
    const auto end_i = this->get_input_rows() - pad_top;
    const auto pad_bottom = end_i < valid_rows ? 0 : end_i - valid_rows;
    const auto end_j = this->get_input_cols() - pad_left;
    const auto pad_right = end_j < valid_cols ? 0 : end_j - valid_cols;

    // Execute the kernel
    m_kernel(
      n_channels, inptr, ld_in_row, ld_in_col,
      pad_top, pad_left, pad_bottom, pad_right,
      outptr, ld_out_matrix
    );
  }

  public:
  TransformDirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
  : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
  {
  }
};
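
/* Sketch only (the kernel name below is hypothetical): a TransformDirect wraps
 * a kernel which receives the amount of padding on each side of the tile and
 * handles it itself, e.g.
 *
 *   TransformDirect<float> transform(
 *     "direct_fp32_6x6",       // Name reported through get_name()
 *     6, 6,                    // Input tile rows and columns
 *     some_direct_fp32_kernel  // Callable matching the Kernel signature above
 *   );
 */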

template <typename TIn, typename TOut=TIn>
class TransformIndirect : public TransformBase<TIn, TOut>
{
  using Kernel = std::function<void(
    unsigned int,  // Number of channels
    const TIn *const *,  // Input pointers (one per point)
    TOut *, size_t  // Base output pointer, stride between matrices
  )>;
  const Kernel m_kernel;

  struct Workspace
  {
    const TIn **inptrs;
    const TIn *input_buffer;
  };

  size_t sizeof_inptr_array(void) const
  {
    return sizeof(const TIn **) * this->get_input_rows() * this->get_input_cols();
  }

  protected:
  size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
  {
    return sizeof(Workspace) + sizeof_inptr_array() + sizeof(TIn) * args.n_input_channels;
  }

  void initialise_thread_working_space(const ConvolutionArgs &args, void *buffer) const override
  {
    Workspace *ws = reinterpret_cast<Workspace *>(buffer);
    buffer = ws + 1;

    ws->inptrs = reinterpret_cast<const TIn **>(buffer);
    buffer = reinterpret_cast<char *>(buffer) + sizeof_inptr_array();

    ws->input_buffer = reinterpret_cast<const TIn *>(buffer);
    memset(buffer, 0, sizeof(TIn) * args.n_input_channels);
  }

  void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
    TOut *outptr, size_t ld_out_matrix,
    unsigned int pad_top, unsigned int valid_rows,
    unsigned int pad_left, unsigned int valid_cols,
    void *working_space
  ) const override
  {
    // Get the working space
    auto ws = reinterpret_cast<Workspace *>(working_space);

    // Construct the input pointer array based on the given arguments
    fill_pointer_array<const TIn>(
      ws->inptrs, this->get_input_rows(), this->get_input_cols(),
      inptr, ld_in_row, ld_in_col,
      ws->input_buffer,
      pad_top, valid_rows,
      pad_left, valid_cols
    );

    // Execute the kernel
    m_kernel(n_channels, ws->inptrs, outptr, ld_out_matrix);
  }

  public:
  TransformIndirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
  : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
  {
  }
};
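
/* The per-thread working space carved out by initialise_thread_working_space
 * above is laid out as:
 *
 *   [ Workspace header | pointer array (input rows x cols entries) |
 *     zero-filled buffer of n_input_channels elements ]
 *
 * The zero-filled buffer is handed to fill_pointer_array, presumably so that
 * padded points in the pointer array can be directed at zeroed data rather
 * than requiring the kernel to special-case padding.
 */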

template <typename TIn, typename TOut=TIn>
class TransformUnpadded : public TransformBase<TIn, TOut>
{
  using Kernel = std::function<void(
    unsigned int,  // Number of channels
    const TIn *, size_t, size_t,  // Pointer to first input element, row and column stride
    TOut *, size_t  // Base output pointer, stride between matrices
  )>;
  const Kernel m_kernel;

  protected:
  size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
  {
    const auto input_points = this->get_input_rows() * this->get_input_cols();
    return sizeof(TIn) * input_points * args.n_input_channels;
  }

  void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
    TOut *const outptr, const size_t ld_out_matrix,
    const unsigned int pad_top, const unsigned int valid_rows,
    const unsigned int pad_left, const unsigned int valid_cols,
    void *const working_space
  ) const override
  {
    // If there's any padding, then copy the valid portion of the tensor into
    // the working space and reset the pointer, row and column strides to point
    // at this copy of the data.
    if (pad_top || valid_rows < this->get_input_rows() ||
        pad_left || valid_cols < this->get_input_cols())
    {
      const auto patch_ld_col = n_channels;
      const auto patch_ld_row = patch_ld_col * this->get_input_cols();
      auto patch = reinterpret_cast<TIn *>(working_space) +
                   pad_top*patch_ld_row + pad_left*patch_ld_col;

      // Fill the input patch with padding
      memset(working_space, 0, sizeof(TIn) * this->get_input_rows() * patch_ld_row);

      // Determine the bounds for which to copy
      const auto last_i = std::min(valid_rows + pad_top, this->get_input_rows());
      const auto last_j = std::min(valid_cols + pad_left, this->get_input_cols());

      // Copy across the valid portion of the patch
      for (auto i = pad_top; i < last_i; i++)
      {
        auto inptr_col = inptr;
        inptr += ld_in_row;

        auto patch_col = patch;
        patch += patch_ld_row;

        for (auto j = pad_left; j < last_j; j++)
        {
          // Perform the copy and progress both input and patch pointers
          memcpy(patch_col, inptr_col, n_channels * sizeof(TIn));
          inptr_col += ld_in_col;
          patch_col += patch_ld_col;
        }
      }

      // Override the input pointer and strides
      inptr = reinterpret_cast<const TIn *>(working_space);
      ld_in_col = patch_ld_col;
      ld_in_row = patch_ld_row;
    }

    // Call the kernel
    m_kernel(n_channels, inptr, ld_in_row, ld_in_col, outptr, ld_out_matrix);
  }

  public:
  TransformUnpadded(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
  : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
  {
  }

  /* Utility method which can be used to get a transposed version of a kernel;
   * this just calls the kernel with the input row and column strides reversed.
   */
  static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
  {
    return [kernel] (
      const unsigned int n_channels,
      const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
      TOut *const outptr, const size_t ld_out_matrix
    ) {
      kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out_matrix);
    };
  }
};
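
/* Illustrative only: given any callable `my_kernel` matching the Kernel
 * signature above, a stride-swapped variant can be obtained with
 *
 *   const auto transposed = TransformUnpadded<float>::get_transposed_kernel(my_kernel);
 */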

}  // namespace input_transform
}  // namespace winograd
}  // namespace arm_conv