1 /*
2 * Copyright (c) 2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #pragma once
26
27 #include "arm_compute/core/Error.h"
28
29 #include "src/core/NEON/kernels/assembly/winograd.hpp"
30
31 #include "src/core/NEON/kernels/arm_conv/addressing.hpp"
32 #include <algorithm>
33 #include <cstring>
34 #include <functional>
35
36 namespace arm_conv {
37 namespace winograd {
38 namespace input_transform {
39
40 namespace {
41
42 template <typename T>
iceildiv(const T a,const T b)43 constexpr T iceildiv(const T a, const T b)
44 {
45 return (a + b - 1) / b;
46 }
47
48 }
49
50 /* Driver class for the Winograd input transforms.
51 *
52 * This provides a base implementation which handles iteration over the input
53 * tensor; subclasses are responsible for managing working space and executing
54 * the transform on individual tiles.
55 */
56 template <typename TIn, typename TOut=TIn>
57 class TransformBase : public ITransform
58 {
59 const std::string m_name;
60 const unsigned int m_input_rows, m_input_cols;
61
62 protected:
get_working_space_per_thread(const ConvolutionArgs &) const63 virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
64 {
65 return 0;
66 }
67
initialise_thread_working_space(const ConvolutionArgs &,void *) const68 virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
69 {
70 // Nothing to do
71 }
72
73 virtual void execute_tile(
74 unsigned int n_channels,
75 const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
76 TOut *outptr, size_t ld_out_matrix,
77 unsigned int pad_top, unsigned int valid_rows,
78 unsigned int pad_left, unsigned int valid_cols,
79 void *working_space
80 ) const = 0;
81
execute_internal(const ConvolutionArgs & args,const TIn * inptr,size_t ld_in_batch,size_t ld_in_row,size_t ld_in_col,TOut * outptr,size_t ld_out_batch,size_t ld_out_matrix,size_t ld_out_row,void * working_space,unsigned int thread_id,unsigned int n_threads) const82 void execute_internal(
83 const ConvolutionArgs &args,
84 const TIn *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
85 TOut *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
86 void *working_space, unsigned int thread_id, unsigned int n_threads
87 ) const
88 {
89 // Get the working space for this thread, and initialise it.
90 working_space = reinterpret_cast<char *>(working_space) +
91 this->get_working_space_per_thread(args) * thread_id;
92 this->initialise_thread_working_space(args, working_space);
93
94 // Get tile traversal parameters
95 const auto tile_stride_rows = std::max(1u, m_input_rows - args.kernel_shape.rows + 1);
96 const auto tile_stride_cols = std::max(1u, m_input_cols - args.kernel_shape.cols + 1);
97 const auto n_tile_rows = iceildiv(
98 args.output_shape.rows, m_input_rows - args.kernel_shape.rows + 1);
99 const auto n_tile_cols = iceildiv(
100 args.output_shape.cols, m_input_cols - args.kernel_shape.cols + 1);
101
102 // Execute over all batches
103 for (unsigned int batch = 0; batch < args.n_batches; batch++)
104 {
105 auto outptr_tile = outptr + thread_id * n_tile_cols * ld_out_row;
106
107 // For a single batch, stripe the rows over the threads.
108 for (auto tile_i = thread_id; tile_i < n_tile_rows; tile_i += n_threads)
109 {
110 // Compute pointers and padding for this row of tiles
111 const auto start_i = tile_i * tile_stride_rows;
112 const auto pad_top = start_i < args.pad_top ? args.pad_top - start_i : 0;
113 const auto inptr_row = inptr + (pad_top ? 0 : start_i - args.pad_top) * ld_in_row;
114 const auto valid_rows = args.input_shape.rows - (pad_top ? 0 : start_i - args.pad_top);
115
116 // Iterate over columns
117 for (auto tile_j = 0u; tile_j < n_tile_cols; tile_j++)
118 {
119 // Compute pointers and padding for this tile, then delegate to
120 // execute the kernel.
121 const auto start_j = tile_j * tile_stride_cols;
122 const auto pad_left = start_j < args.pad_left ? args.pad_left - start_j : 0;
123 const auto inptr_tile = inptr_row + (pad_left ? 0 : start_j - args.pad_left) * ld_in_col;
124 const auto valid_cols = args.input_shape.cols - (pad_left ? 0 : start_j - args.pad_left);
125
126 this->execute_tile(
127 args.n_input_channels,
128 inptr_tile, ld_in_row, ld_in_col,
129 outptr_tile, ld_out_matrix,
130 pad_top, valid_rows, pad_left, valid_cols,
131 working_space
132 );
133 outptr_tile += ld_out_row;
134 }
135
136 outptr_tile += (n_threads - 1) * n_tile_cols * ld_out_row;
137 }
138
139 inptr += ld_in_batch;
140 outptr += ld_out_batch;
141 }
142 }
143
144 public:
TransformBase(const std::string & name,unsigned int input_rows,unsigned int input_cols)145 TransformBase(const std::string &name, unsigned int input_rows, unsigned int input_cols)
146 : m_name(name), m_input_rows(input_rows), m_input_cols(input_cols)
147 {
148 }
149
get_name(void) const150 const std::string &get_name(void) const override { return m_name; }
151
get_input_rows(void) const152 unsigned int get_input_rows(void) const override final { return m_input_rows; }
get_input_cols(void) const153 unsigned int get_input_cols(void) const override final { return m_input_cols; }
154
get_working_space_size(const ConvolutionArgs & args,unsigned int n_threads) const155 size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
156 {
157 return n_threads * this->get_working_space_per_thread(args);
158 }
159
execute(const ConvolutionArgs & args,const void * inptr,size_t ld_in_batch,size_t ld_in_row,size_t ld_in_col,void * outptr,size_t ld_out_batch,size_t ld_out_matrix,size_t ld_out_row,void * working_space,unsigned int thread_id,unsigned int n_threads) const160 void execute(
161 const ConvolutionArgs &args,
162 const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
163 void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
164 void *working_space, unsigned int thread_id, unsigned int n_threads
165 ) const override
166 {
167 execute_internal(
168 args,
169 reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_row, ld_in_col,
170 reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_matrix, ld_out_row,
171 working_space, thread_id, n_threads
172 );
173 }
174 };
175
176 template <typename TIn, typename TOut=TIn>
177 class TransformDirect : public TransformBase<TIn, TOut>
178 {
179 using Kernel = std::function<void(
180 unsigned int, // Number of channels
181 const TIn *, size_t, size_t, // Pointer to first valid input element, row and column stride
182 unsigned int, unsigned int, unsigned int, unsigned int, // Top, left, bottom and right padding
183 TOut *, size_t // Base output pointer, stride between matrices
184 )>;
185 const Kernel m_kernel;
186
187 protected:
execute_tile(unsigned int n_channels,const TIn * inptr,size_t ld_in_row,size_t ld_in_col,TOut * outptr,size_t ld_out_matrix,unsigned int pad_top,unsigned int valid_rows,unsigned int pad_left,unsigned int valid_cols,void * working_space) const188 void execute_tile(
189 unsigned int n_channels,
190 const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
191 TOut *outptr, size_t ld_out_matrix,
192 unsigned int pad_top, unsigned int valid_rows,
193 unsigned int pad_left, unsigned int valid_cols,
194 void *working_space
195 ) const override
196 {
197 ARM_COMPUTE_UNUSED(working_space);
198 const auto end_i = this->get_input_rows() - pad_top;
199 const auto pad_bottom = end_i < valid_rows ? 0 : end_i - valid_rows;
200 const auto end_j = this->get_input_cols() - pad_left;
201 const auto pad_right = end_j < valid_cols ? 0 : end_j - valid_cols;
202
203 // Execute the kernel
204 m_kernel(
205 n_channels, inptr, ld_in_row, ld_in_col,
206 pad_top, pad_left, pad_bottom, pad_right,
207 outptr, ld_out_matrix
208 );
209 }
210
211 public:
TransformDirect(const std::string & name,unsigned int input_rows,unsigned int input_cols,Kernel kernel)212 TransformDirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
213 : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
214 {
215 }
216 };
217
218 template <typename TIn, typename TOut=TIn>
219 class TransformIndirect : public TransformBase<TIn, TOut>
220 {
221 using Kernel = std::function<void(
222 unsigned int, // Number of channels
223 const TIn *const *, // Input pointers (one per point)
224 TOut *, size_t // Base output pointer, stride between matrices
225 )>;
226 const Kernel m_kernel;
227
228 struct Workspace
229 {
230 const TIn **inptrs;
231 const TIn *input_buffer;
232 };
233
sizeof_inptr_array(void) const234 size_t sizeof_inptr_array(void) const
235 {
236 return sizeof(const TIn **) * this->get_input_rows() * this->get_input_cols();
237 }
238
239 protected:
get_working_space_per_thread(const ConvolutionArgs & args) const240 size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
241 {
242 return sizeof(Workspace) + sizeof_inptr_array() + sizeof(TIn) * args.n_input_channels;
243 }
244
initialise_thread_working_space(const ConvolutionArgs & args,void * buffer) const245 void initialise_thread_working_space(const ConvolutionArgs &args, void *buffer) const override
246 {
247 Workspace *ws = reinterpret_cast<Workspace *>(buffer);
248 buffer = ws + 1;
249
250 ws->inptrs = reinterpret_cast<const TIn **>(buffer);
251 buffer = reinterpret_cast<char *>(buffer) + sizeof_inptr_array();
252
253 ws->input_buffer = reinterpret_cast<const TIn *>(buffer);
254 memset(buffer, 0, sizeof(TIn) * args.n_input_channels);
255 }
256
execute_tile(unsigned int n_channels,const TIn * inptr,size_t ld_in_row,size_t ld_in_col,TOut * outptr,size_t ld_out_matrix,unsigned int pad_top,unsigned int valid_rows,unsigned int pad_left,unsigned int valid_cols,void * working_space) const257 void execute_tile(
258 unsigned int n_channels,
259 const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
260 TOut *outptr, size_t ld_out_matrix,
261 unsigned int pad_top, unsigned int valid_rows,
262 unsigned int pad_left, unsigned int valid_cols,
263 void *working_space
264 ) const override
265 {
266 // Get the working space
267 auto ws = reinterpret_cast<Workspace *>(working_space);
268
269 // Construct the input pointer array based on the given arguments
270 fill_pointer_array<const TIn>(
271 ws->inptrs, this->get_input_rows(), this->get_input_cols(),
272 inptr, ld_in_row, ld_in_col,
273 ws->input_buffer,
274 pad_top, valid_rows,
275 pad_left, valid_cols
276 );
277
278 // Execute the kernel
279 m_kernel(n_channels, ws->inptrs, outptr, ld_out_matrix);
280 }
281
282 public:
TransformIndirect(const std::string & name,unsigned int input_rows,unsigned int input_cols,Kernel kernel)283 TransformIndirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
284 : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
285 {
286 }
287 };
288
289 template <typename TIn, typename TOut=TIn>
290 class TransformUnpadded : public TransformBase<TIn, TOut>
291 {
292 using Kernel = std::function<void(
293 unsigned int, // Number of channels
294 const TIn *, size_t, size_t, // Pointer to first input element, row and column stride
295 TOut *, size_t // Base output pointer, stride between matrices
296 )>;
297 const Kernel m_kernel;
298
299 protected:
get_working_space_per_thread(const ConvolutionArgs & args) const300 size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
301 {
302 const auto input_points = this->get_input_rows() * this->get_input_cols();
303 return sizeof(TIn) * input_points * args.n_input_channels;
304 }
305
execute_tile(unsigned int n_channels,const TIn * inptr,size_t ld_in_row,size_t ld_in_col,TOut * const outptr,const size_t ld_out_matrix,const unsigned int pad_top,const unsigned int valid_rows,const unsigned int pad_left,const unsigned int valid_cols,void * const working_space) const306 void execute_tile(
307 unsigned int n_channels,
308 const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
309 TOut *const outptr, const size_t ld_out_matrix,
310 const unsigned int pad_top, const unsigned int valid_rows,
311 const unsigned int pad_left, const unsigned int valid_cols,
312 void *const working_space
313 ) const override
314 {
315 // If there's any padding, then copy the valid portion of the tensor into
316 // the working space and reset the pointer, row and column strides to point
317 // at this copy of the data.
318 if (pad_top || valid_rows < this->get_input_rows() ||
319 pad_left || valid_cols < this->get_input_cols())
320 {
321 const auto patch_ld_col = n_channels;
322 const auto patch_ld_row = patch_ld_col * this->get_input_cols();
323 auto patch = reinterpret_cast<TIn *>(working_space) +
324 pad_top*patch_ld_row + pad_left*patch_ld_col;
325
326 // Fill the input patch with padding
327 memset(working_space, 0, sizeof(TIn) * this->get_input_rows() * patch_ld_row);
328
329 // Determine the bounds for which to copy
330 const auto last_i = std::min(valid_rows + pad_top, this->get_input_rows());
331 const auto last_j = std::min(valid_cols + pad_left, this->get_input_cols());
332
333 // Copy across the valid portion of the patch
334 for (auto i = pad_top; i < last_i; i++)
335 {
336 auto inptr_col = inptr;
337 inptr += ld_in_row;
338
339 auto patch_col = patch;
340 patch += patch_ld_row;
341
342 for (auto j = pad_left; j < last_j; j++)
343 {
344 // Perform the copy and progress both input and patch pointers
345 memcpy(patch_col, inptr_col, n_channels * sizeof(TIn));
346 inptr_col += ld_in_col;
347 patch_col += patch_ld_col;
348 }
349 }
350
351 // Override the input pointer and strides
352 inptr = reinterpret_cast<const TIn *>(working_space);
353 ld_in_col = patch_ld_col;
354 ld_in_row = patch_ld_row;
355 }
356
357 // Call the kernel
358 m_kernel(n_channels, inptr, ld_in_row, ld_in_col, outptr, ld_out_matrix);
359 }
360
361 public:
TransformUnpadded(const std::string & name,unsigned int input_rows,unsigned int input_cols,Kernel kernel)362 TransformUnpadded(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
363 : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
364 {
365 }
366
367 /* Utility method which can be used to get a transposed version of a kernel,
368 * this just calls the kernel with the input row and column strides reversed.
369 */
get_transposed_kernel(const Kernel & kernel)370 static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
371 {
372 return [kernel] (
373 const unsigned int n_channels,
374 const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
375 TOut *const outptr, const size_t ld_out_matrix
376 ) {
377 kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out_matrix);
378 };
379 }
380 };
381
382 } // namespace input_transform
383 } // namespace winograd
384 } // namespace arm_conv
385