1 /* 2 * Copyright (c) 2017-2020 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 #ifndef ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H 25 #define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H 26 27 #include "src/core/NEON/INEKernel.h" 28 #include "src/core/NEON/kernels/convolution/common/convolution.hpp" 29 #include "src/core/NEON/kernels/convolution/common/tensor.hpp" 30 31 #include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp" 32 33 namespace arm_compute 34 { 35 // Forward declarations 36 class ITensor; 37 38 /** Interface for the NEON kernel to perform Winograd input transform. */ 39 class INEWinogradLayerTransformInputKernel : public INEKernel 40 { 41 public: 42 /** Get the working space required to perform the transformation. 43 * 44 * Note, the working space is only required when performing the 45 * transformation - hence it can be reused whenever the transformation is 46 * not running. 47 * 48 * @param num_threads The greatest number of threads that will be used to execute the transform. 49 * @return Size of working space required in bytes. 50 */ 51 virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; 52 53 /** Determine how much memory (in units of TIn) to allocate for the 54 * transformed input. 55 * 56 * @param[in] num_batches Number of batches in the input tensor. 57 * @param[in] num_channels Number of feature maps in the input tensor. 58 * @param[in] num_rows Number of rows in each feature map. 59 * @param[in] num_cols Number of columns in each feature map. 60 * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". 61 * 62 * @return Storage size (in units of TIn) required. 63 */ 64 virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; 65 66 /** Gets the stride between matrices in the input worspace 67 * 68 * @param[in] num_batches Number of batches in the input tensor. 69 * @param[in] num_channels Number of feature maps in the input tensor. 70 * @param[in] num_rows Number of rows in each feature map. 71 * @param[in] num_cols Number of columns in each feature map. 72 * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". 73 * 74 * @return Stride expressed in bytes. 75 */ 76 virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; 77 78 /** Configure the output transform kernel. 79 * 80 * @param[in] input_nhwc Input tensor in NHWC data layout format. 81 * @param[in] num_batches Number of batches in input tensor. 82 * @param[in] num_rows Number of rows in input tensor. 83 * @param[in] num_cols Number of columns in input tensor. 84 * @param[in] num_channels Number of channels in input tensor. 85 * @param[in] padding Padding type. 86 * @param[out] output Base of output matrices. 87 * @param[in] matrix_stride Stride between output matrices. 88 * @param[in] workspace Tensor to be used as the working space during the computation. 89 */ 90 virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels, 91 const PaddingType padding, ITensor *output, const int matrix_stride, ITensor *workspace) = 0; 92 93 /** Destructor */ ~INEWinogradLayerTransformInputKernel()94 virtual ~INEWinogradLayerTransformInputKernel() 95 { 96 } 97 }; 98 99 /** NEON kernel to perform Winograd input transform. */ 100 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> 101 class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel 102 { 103 public: 104 /** Prevent instances of this class from being copied (As this class contains pointers) */ 105 NEWinogradLayerTransformInputKernel(const NEWinogradLayerTransformInputKernel &) = delete; 106 /** Prevent instances of this class from being copied (As this class contains pointers) */ 107 NEWinogradLayerTransformInputKernel &operator=(const NEWinogradLayerTransformInputKernel &) = delete; 108 /** Allow instances of this class to be moved */ 109 NEWinogradLayerTransformInputKernel(NEWinogradLayerTransformInputKernel &&) = default; 110 /** Allow instances of this class to be moved */ 111 NEWinogradLayerTransformInputKernel &operator=(NEWinogradLayerTransformInputKernel &&) = default; 112 /** Default destructor */ 113 ~NEWinogradLayerTransformInputKernel() = default; 114 115 /** Determine how much memory (in units of TIn) to allocate for the 116 * transformed input. 117 * 118 * @param[in] num_batches Number of batches in the input tensor. 119 * @param[in] num_channels Number of feature maps in the input tensor. 120 * @param[in] num_rows Number of rows in each feature map. 121 * @param[in] num_cols Number of columns in each feature map. 122 * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". 123 * 124 * @return Storage size (in units of TIn) required. 125 */ 126 unsigned int get_input_storage_size( 127 int num_batches, 128 int num_channels, 129 int num_rows, 130 int num_cols, 131 bool same_padding) const override; 132 133 /** Get the working space required to perform the transformation. 134 * 135 * Note, the working space is only required when performing the 136 * transformation - hence it can be reused whenever the transformation is 137 * not running. 138 * 139 * @param[in] num_threads The greatest number of threads that will be used to execute the transform. 140 * 141 * @return Size of working space required in bytes. 142 */ 143 unsigned int get_working_space_size(unsigned int num_threads) const override; 144 145 /** Gets the stride between matrices in the input worspace 146 * 147 * @param[in] num_batches Number of batches in the input tensor. 148 * @param[in] num_channels Number of feature maps in the input tensor. 149 * @param[in] num_rows Number of rows in each feature map. 150 * @param[in] num_cols Number of columns in each feature map. 151 * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". 152 * 153 * @return Stride expressed in bytes. 154 */ 155 int get_matrix_stride( 156 int num_batches, 157 int num_channels, 158 int num_rows, 159 int num_cols, 160 bool same_padding) const override; 161 162 /** Default constructor */ 163 NEWinogradLayerTransformInputKernel(); 164 name()165 const char *name() const override 166 { 167 return "NEWinogradLayerTransformInputKernel"; 168 } 169 170 /** Configure the output transform kernel. 171 * 172 * @param[in] input_nhwc Input tensor. Data types supported: F16/F32. Layout supported NHWC. 173 * @param[in] num_batches Number of batches in input tensor. 174 * @param[in] num_rows Number of rows in input tensor. 175 * @param[in] num_cols Number of columns in input tensor. 176 * @param[in] num_channels Number of channels in input tensor. 177 * @param[in] padding Padding type. 178 * @param[out] output Base of output matrices. 179 * @param[in] matrix_stride Stride between output matrices. 180 * @param[in] workspace Tensor to be used as the working space during the computation. 181 */ 182 void configure( 183 const ITensor *input_nhwc, 184 const int num_batches, 185 const int num_rows, 186 const int num_cols, 187 const int num_channels, 188 const PaddingType padding, 189 ITensor *output, 190 const int matrix_stride, 191 ITensor *workspace) override; 192 193 // Inherited methods overridden: 194 void run(const Window &window, const ThreadInfo &info) override; 195 196 /** Winograd base kernel */ 197 using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; 198 /** Winograd convolution kernel */ 199 using WinogradConv = typename WinogradBase::template Convolution<T, T>; 200 201 /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformInputKernel 202 * 203 * @param[in] input First tensor input info. Data types supported: F16/F32. 204 * @param[in] output Output tensor info. Data types supported: same as @p input. 205 * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo 206 * 207 * @return a status 208 */ 209 static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); 210 211 private: 212 using InputTransform = typename WinogradBase::template InputTransform<T, T>; 213 214 std::unique_ptr<InputTransform> _transform{ nullptr }; 215 const ITensor *_input_nhwc; 216 int _num_batches; /**< Number of batches in input tensor. */ 217 int _num_rows; /**< Number of rows in input tensor. */ 218 int _num_cols; /**< Number of columns in input tensor. */ 219 int _num_channels; /**< Number of channels in input tensor. */ 220 PaddingType _padding; /**< Padding type. */ 221 ITensor *_output; /**< Base of output matrices. */ 222 int _matrix_stride; /**< Stride between output matrices. */ 223 int _padding_top; /**< Padding to apply to the top of the image. */ 224 int _padding_left; /**< Padding to apply to the left of the image. */ 225 int _padding_right; /**< Padding to apply to the right of the image. */ 226 int _padding_bottom; /**< Padding to apply to the bottom of the image. */ 227 ITensor *_workspace; 228 }; 229 230 /** Interface for the NEON kernel to perform Winograd output transform. */ 231 class INEWinogradLayerTransformOutputKernel : public INEKernel 232 { 233 public: 234 /** Get the working space required to perform the transformation. 235 * 236 * Note, the working space is only required when performing the 237 * transformation - hence it can be reused whenever the transformation is 238 * not running. 239 * 240 * @param[in] num_threads The greatest number of threads that will be used to execute the transform. 241 * 242 * @return Size of working space required in bytes. 243 */ 244 virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; 245 246 /** Determine how much memory (in units of TOut) to allocate for the 247 * (Winograd domain) output. 248 * 249 * @param[in] num_batches Number of batches in the output tensor. 250 * @param[in] num_rows Number of rows in each feature map of the input tensor. 251 * @param[in] num_cols Number of columns in each feature map of the input tensor. 252 * @param[in] num_output_channels Number of feature maps in the output tensor. 253 * 254 * @return Storage size (in units of TOut) required. 255 */ 256 virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; 257 258 /** Gets the stride between matrices in the output worspace 259 * 260 * @param[in] num_batches Number of batches in the output tensor. 261 * @param[in] num_rows Number of rows in each feature map of the input tensor. 262 * @param[in] num_cols Number of columns in each feature map of the input tensor. 263 * @param[in] num_output_channels Number of feature maps in the output tensor. 264 * 265 * @return Stride expressed in bytes. 266 */ 267 virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; 268 269 /** Get the output shape of a convolution. 270 * 271 * @param[in] num_rows Number of rows in each feature map of the input tensor. 272 * @param[in] num_cols Number of columns in each feature map of the input tensor. 273 * @param[in] padding_same True if padding is SAME, false otherwise 274 * 275 * @return Shape of the output tensor 276 */ 277 virtual std::pair<unsigned int, unsigned int> get_output_shape( 278 int num_rows, /* Number of rows in each feature map of the input tensor. */ 279 int num_cols, /* Number of columns in each feature map of the input tensor. */ 280 bool padding_same /* True if padding is SAME, false otherwise */ 281 ) const = 0; 282 283 /** Configure the output transform kernel. 284 * 285 * @param[in] biases Pointer to the biases tensor. 286 * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. 287 * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride() 288 * @param[out] output_nhwc Pointer to a tensor in NHWC data layout ordered output tensor, in the spatial domain. 289 * @param[in] num_batches Number of batches in the input tensor. 290 * @param[in] num_rows Number of rows in output tensor. 291 * @param[in] num_cols Number of columns in output tensor. 292 * @param[in] num_channels Number of feature maps in the output tensor. 293 * @param[in] workspace Tensor to be used as the working space during the computation. 294 * @param[in] activation Activation to be used 295 */ 296 virtual void configure( 297 const ITensor *biases, 298 const ITensor *transformed_output, 299 const int matrix_stride, 300 ITensor *output_nhwc, 301 const int num_batches, 302 const int num_rows, 303 const int num_cols, 304 const int num_channels, 305 ITensor *workspace, 306 const arm_gemm::Activation &activation) = 0; 307 ~INEWinogradLayerTransformOutputKernel()308 virtual ~INEWinogradLayerTransformOutputKernel() 309 { 310 } 311 }; 312 313 /** NEON kernel to perform Winograd output transform. */ 314 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> 315 class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel 316 { 317 public: name()318 const char *name() const override 319 { 320 return "NEWinogradLayerTransformOutputKernel"; 321 } 322 /** Constructor */ 323 NEWinogradLayerTransformOutputKernel(); 324 325 /** Prevent instances of this class from being copied (As this class contains pointers) */ 326 NEWinogradLayerTransformOutputKernel(const NEWinogradLayerTransformOutputKernel &) = delete; 327 /** Prevent instances of this class from being copied (As this class contains pointers) */ 328 NEWinogradLayerTransformOutputKernel &operator=(const NEWinogradLayerTransformOutputKernel &) = delete; 329 /** Allow instances of this class to be moved */ 330 NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default; 331 /** Allow instances of this class to be moved */ 332 NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default; 333 /** Default destructor */ 334 ~NEWinogradLayerTransformOutputKernel() = default; 335 336 // Inherited methods overridden: 337 /** Determine how much memory (in units of TOut) to allocate for the 338 * (Winograd domain) output. 339 * 340 * @param[in] num_batches Number of batches in the output tensor. 341 * @param[in] num_rows Number of rows in each feature map of the input tensor. 342 * @param[in] num_cols Number of columns in each feature map of the input tensor. 343 * @param[in] num_output_channels Number of feature maps in the output tensor. 344 * 345 * @return Storage size (in units of TOut) required. 346 */ 347 unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; 348 349 /** Gets the stride between matrices in the output worspace 350 * 351 * @param[in] num_batches Number of batches in the output tensor. 352 * @param[in] num_rows Number of rows in each feature map of the input tensor. 353 * @param[in] num_cols Number of columns in each feature map of the input tensor. 354 * @param[in] num_output_channels Number of feature maps in the output tensor. 355 * 356 * @return Stride expressed in bytes. 357 */ 358 int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; 359 /** Get the output shape of a convolution. 360 * 361 * @param[in] num_rows Number of rows in each feature map of the input tensor. 362 * @param[in] num_cols Number of columns in each feature map of the input tensor. 363 * @param[in] padding_same True if padding is SAME, false otherwise 364 * 365 * @return Shape of the output tensor 366 */ 367 std::pair<unsigned int, unsigned int> get_output_shape( 368 int num_rows, /* Number of rows in each feature map of the input tensor. */ 369 int num_cols, /* Number of columns in each feature map of the input tensor. */ 370 bool padding_same) const override; 371 372 /** Get the working space required to perform the transformation. 373 * 374 * Note, the working space is only required when performing the 375 * transformation - hence it can be reused whenever the transformation is 376 * not running. 377 * 378 * @param[in] num_threads The greatest number of threads that will be used to execute the transform. 379 * 380 * @return Size of working space required in bytes. 381 */ 382 unsigned int get_working_space_size(unsigned int num_threads) const override; 383 384 /** Configure the output transform kernel. 385 * 386 * @param[in] biases Pointer to the biases tensor. 387 * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. 388 * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride() 389 * @param[out] output_nhwc Pointer to a tensor with NHWC data layout, in the spatial domain. 390 * @param[in] num_batches Number of batches in the input tensor. 391 * @param[in] num_rows Number of rows in output tensor. 392 * @param[in] num_cols Number of columns in output tensor. 393 * @param[in] num_channels Number of feature maps in the output tensor. 394 * @param[in] workspace Tensor to be used as the working space during the computation. 395 * @param[in] activation Activation to be used 396 */ 397 void configure( 398 const ITensor *biases, 399 const ITensor *transformed_output, 400 const int matrix_stride, 401 ITensor *output_nhwc, 402 const int num_batches, 403 const int num_rows, 404 const int num_cols, 405 const int num_channels, 406 ITensor *workspace, 407 const arm_gemm::Activation &activation) override; 408 409 void run(const Window &window, const ThreadInfo &info) override; 410 411 /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformOutputKernel 412 * 413 * @param[in] input Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32. 414 * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input 415 * @param[in] output Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input 416 * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo 417 * 418 * @return a status 419 */ 420 static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info); 421 422 private: 423 using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; 424 using WinogradConv = typename WinogradBase::template Convolution<T, T>; 425 using OutputTransform = typename WinogradBase::template OutputTransform<T, T>; 426 427 std::unique_ptr<OutputTransform> _transform{ nullptr }; 428 const ITensor *_biases; 429 const ITensor *_transformed_output; 430 ITensor *_workspace; 431 int _matrix_stride; 432 int _matrix_row_stride; 433 ITensor *_output_nhwc; 434 int _num_batches; 435 int _num_rows; 436 int _num_cols; 437 int _num_channels; 438 }; 439 440 /** Interface for the NEON kernel to perform Winograd weights transform. */ 441 class INEWinogradLayerTransformWeightsKernel : public INEKernel 442 { 443 public: 444 /** Prevent instances of this class from being copied (As this class contains pointers) */ 445 INEWinogradLayerTransformWeightsKernel(const INEWinogradLayerTransformWeightsKernel &) = default; 446 /** Prevent instances of this class from being copied (As this class contains pointers) */ 447 INEWinogradLayerTransformWeightsKernel &operator=(const INEWinogradLayerTransformWeightsKernel &) = default; 448 /** Allow instances of this class to be moved */ 449 INEWinogradLayerTransformWeightsKernel(INEWinogradLayerTransformWeightsKernel &&) = default; 450 /** Allow instances of this class to be moved */ 451 INEWinogradLayerTransformWeightsKernel &operator=(INEWinogradLayerTransformWeightsKernel &&) = default; 452 INEWinogradLayerTransformWeightsKernel()453 INEWinogradLayerTransformWeightsKernel() 454 { 455 } ~INEWinogradLayerTransformWeightsKernel()456 virtual ~INEWinogradLayerTransformWeightsKernel() 457 { 458 } 459 /** Determine how much memory (in units of T) to allocate for the 460 * transformed weights. 461 * 462 * @param[in] num_output_channels Number of output feature maps. 463 * @param[in] num_input_channels Number of input feature maps. 464 * 465 * @return Storage size (in units of T) required. 466 */ 467 virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0; 468 /** Gets the stride between matrices in the kernel worspace 469 * 470 * @param[in] num_output_channels Number of output feature maps. 471 * @param[in] num_input_channels Number of input feature maps. 472 * 473 * @return Stride expressed in bytes. 474 */ 475 virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0; 476 477 /** Configure the weights transform kernel. 478 * 479 * @param[in] weights_hwio Pointer to the weights tensor 480 * @param[out] output Pointer to working space for the output tensor in the Winograd domain. 481 * @param[in] matrix_stride Stride across matrices in the output workspace. 482 * @param[in] num_output_channels Number of filters. 483 * @param[in] num_input_channels Number of channels in each filter. 484 */ 485 486 virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; 487 488 /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel 489 * 490 * @param[in] input First tensor input info. Data types supported: F16/F32. 491 * @param[in] weights Weights tensor info. Data types supported: same as @p input. 492 * 493 * @return a status 494 */ 495 static Status validate(const ITensorInfo *input, const ITensorInfo *weights); 496 }; 497 498 /** NEON kernel to perform Winograd weights transform. */ 499 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> 500 class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel 501 { 502 public: 503 /** Prevent instances of this class from being copied (As this class contains pointers) */ 504 NEWinogradLayerTransformWeightsKernel(const NEWinogradLayerTransformWeightsKernel &) = delete; 505 /** Prevent instances of this class from being copied (As this class contains pointers) */ 506 NEWinogradLayerTransformWeightsKernel &operator=(const NEWinogradLayerTransformWeightsKernel &) = delete; 507 /** Allow instances of this class to be moved */ 508 NEWinogradLayerTransformWeightsKernel(NEWinogradLayerTransformWeightsKernel &&) = default; 509 /** Allow instances of this class to be moved */ 510 NEWinogradLayerTransformWeightsKernel &operator=(NEWinogradLayerTransformWeightsKernel &&) = default; 511 /** Default destructor */ 512 ~NEWinogradLayerTransformWeightsKernel() = default; 513 514 /** Default constructor. */ 515 NEWinogradLayerTransformWeightsKernel(); name()516 const char *name() const override 517 { 518 return "NEWinogradLayerTransformWeightsKernel"; 519 } 520 521 /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel 522 * 523 * @param[in] input Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). 524 * kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32. 525 * @param[in] output Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input 526 * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo 527 * 528 * @return a status 529 */ 530 static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); 531 532 // Inherited methods overridden: 533 534 #ifndef DOXYGEN_SKIP_THIS 535 /** Configure the weights transform kernel. 536 * 537 * @param[in] weights_hwio Pointer to the weights tensor 538 * @param[out] output Pointer to working space for the output tensor in the Winograd domain. 539 * @param[in] matrix_stride Stride across matrices in the output workspace. 540 * @param[in] num_output_channels Number of filters. 541 * @param[in] num_input_channels Number of channels in each filter. 542 */ 543 void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; 544 #endif /* DOXYGEN_SKIP_THIS */ 545 546 /** Determine how much memory (in units of T) to allocate for the 547 * transformed weights. 548 * 549 * @param[in] num_output_channels Number of output feature maps. 550 * @param[in] num_input_channels Number of input feature maps. 551 * 552 * @return Storage size (in units of T) required. 553 */ 554 unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override; 555 556 /** Gets the stride between matrices in the input worspace 557 * 558 * @param[in] num_output_channels Number of output feature maps. 559 * @param[in] num_input_channels Number of input feature maps. 560 * 561 * @return Stride expressed in bytes. 562 */ 563 int get_matrix_stride(int num_output_channels, int num_input_channels) const override; 564 void run(const Window &window, const ThreadInfo &info) override; 565 bool is_parallelisable() const override; 566 567 private: 568 using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; 569 using WinogradConv = typename WinogradBase::template Convolution<T, T>; 570 using WeightsTransform = typename WinogradBase::template WeightsTransform<T, T>; 571 572 std::unique_ptr<WeightsTransform> _transform{ nullptr }; 573 const ITensor *_weights_hwio; 574 ITensor *_output; 575 int _matrix_stride; 576 int _num_output_channels; 577 int _num_input_channels; 578 }; 579 580 /** NEON kernel to perform Winograd. */ 581 template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> 582 class NEWinogradLayerConfiguration 583 { 584 public: 585 /** Winograd base kernel */ 586 using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; 587 /** Winograd convolution kernel */ 588 589 using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>; 590 591 using TransformInputKernel = NEWinogradLayerTransformInputKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>; 592 using TransformWeightsKernel = NEWinogradLayerTransformWeightsKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>; 593 using TransformOutputKernel = NEWinogradLayerTransformOutputKernel<TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>; 594 }; 595 596 } // namespace arm_compute 597 #endif /*ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H*/ 598