/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <arm_neon.h>
#include "activation.hpp"
#include "padding.hpp"

namespace depthwise
{

namespace nck = neon_convolution_kernels;

/** Abstract interface to a depthwise convolution engine.
 *
 * Usage protocol (in order): bind input and output tensors via set_input() /
 * set_output(); allocate and fill the packed parameter buffer
 * (get_packed_params_size(), set_packed_params_buffer(), pack_params());
 * allocate working space (get_working_space_size(), set_working_space());
 * finally partition the work reported by get_window() across threads and
 * execute it with run().
 */
class IDepthwiseConvolution
{
 public:
  virtual ~IDepthwiseConvolution() = default;

  /** Get the number of output elements produced along one dimension.
   *
   * @param[in] dim_size       Number of input elements in the dimension.
   * @param[in] padding_before Padding applied before the first element.
   * @param[in] padding_after  Padding applied after the last element.
   */
  virtual int output_size(
    int dim_size,
    unsigned int padding_before,
    unsigned int padding_after
  ) const = 0;

  /* Set input tensor and stride.
   *
   * Overloads which omit strides derive the missing strides from the tensor
   * dimensions supplied at construction (i.e. a dense NHWC layout).
   */
  virtual void set_input(const void *inptr) = 0;
  virtual void set_input(const void *inptr, int column_stride) = 0;
  virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
  virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;

  /* Set output tensor and stride.
   *
   * Overloads which omit strides derive the missing strides from the tensor
   * dimensions supplied at construction (i.e. a dense NHWC layout).
   */
  virtual void set_output(void *outptr) = 0;
  virtual void set_output(void *outptr, int column_stride) = 0;
  virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
  virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;

  /* Weights and biases are re-ordered to improve memory access patterns. Use
   * these methods to determine the size of the re-pack buffer and to set the
   * address (and implicitly reorder the weights and biases into) the buffer.
   */
  virtual size_t get_packed_params_size(void) const = 0;
  virtual void set_packed_params_buffer(void *) = 0;

  /* Pack weights (and optionally biases) either into the buffer previously
   * registered with set_packed_params_buffer() or into an explicit buffer.
   * The strided overload reads weights with caller-supplied row/column
   * strides (in elements).
   */
  virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
  virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
  virtual void pack_params(
    void *buffer,
    const void* weights,
    unsigned int weight_row_stride,
    unsigned int weight_col_stride,
    const void *biases=nullptr
  ) const = 0;

  /* Working space is used to pad tensors on the fly. Before running any
   * inference check the amount of space required, allocate and provide a
   * pointer to the convolution engine.
   */
  virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
  virtual void set_working_space(void *) = 0;

  /* Total number of units of work; distribute [0, get_window()) across
   * threads and pass sub-ranges to run().
   */
  virtual unsigned int get_window(void) const = 0;
  virtual void run(
    unsigned int start,
    unsigned int stop,
    unsigned int threadid=0
  ) = 0;
};

/** CRTP base implementing the tiling and padding logic shared by all
 * depthwise convolution engines.
 *
 * The tensor is processed in tiles of OutputTileRows x OutputTileCols output
 * elements; the `Derived` class provides the arithmetic for a single tile
 * via its `execute_tile` methods.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename TIn, typename TBias, typename TOut,
  typename Derived
>
class DepthwiseConvolutionBase : public IDepthwiseConvolution
{
 public:
  // Information about the specific convolution instance
  using InputType = TIn;
  using BiasType = TBias;
  using OutputType = TOut;
  static constexpr int output_tile_rows = OutputTileRows;
  static constexpr int output_tile_cols = OutputTileCols;
  static constexpr int kernel_rows = KernelRows;
  static constexpr int kernel_cols = KernelCols;
  static constexpr int stride_rows = StrideRows;
  static constexpr int stride_cols = StrideCols;
  // Input footprint of one output tile: the receptive field of
  // OutputTileRows/Cols outputs under the given stride and kernel size.
  static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
  static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;

  /** Create a new depthwise convolution engine.
   *
   * The output size is derived from the input size and padding.
   *
   * @param[in] n_batches      Number of batches in the tensors.
   * @param[in] n_input_rows   Number of rows in input tensor.
   * @param[in] n_input_cols   Number of columns in input tensor.
   * @param[in] n_channels     Number of channels in input and output tensors.
   * @param[in] activation     Activation function fused with the convolution.
   * @param[in] padding_top    Padding (rows) applied above the input.
   * @param[in] padding_left   Padding (columns) applied left of the input.
   * @param[in] padding_bottom Padding (rows) applied below the input.
   * @param[in] padding_right  Padding (columns) applied right of the input.
   */
  DepthwiseConvolutionBase(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    nck::ActivationFunction activation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  /** Create a new depthwise convolution engine with an explicit output size.
   *
   * @param[in] n_batches      Number of batches in the tensors.
   * @param[in] n_input_rows   Number of rows in input tensor.
   * @param[in] n_input_cols   Number of columns in input tensor.
   * @param[in] n_channels     Number of channels in input and output tensors.
   * @param[in] n_output_rows  Number of rows in output tensor.
   * @param[in] n_output_cols  Number of columns in output tensor.
   * @param[in] activation     Activation function fused with the convolution.
   * @param[in] padding_top    Padding (rows) applied above the input.
   * @param[in] padding_left   Padding (columns) applied left of the input.
   * @param[in] padding_bottom Padding (rows) applied below the input.
   * @param[in] padding_right  Padding (columns) applied right of the input.
   */
  DepthwiseConvolutionBase(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    int n_output_rows, int n_output_cols,
    nck::ActivationFunction activation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  // Cannot copy or move a DepthwiseConvolution.
  DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
  DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;

  /* Set input tensor and stride. */
  void set_input(const void *inptr) override;
  void set_input(const void *inptr, int column_stride) override;
  void set_input(const void *inptr, int row_stride, int column_stride) override;
  void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;

  /* Set output tensor and stride. */
  void set_output(void *outptr) override;
  void set_output(void *outptr, int column_stride) override;
  void set_output(void *outptr, int row_stride, int column_stride) override;
  void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;

  /** Get the number of output rows/columns.
   *
   * @param[in] dim_size       Number of elements in the dimension (rows/columns).
   * @param[in] padding_before Padding applied before the first element.
   * @param[in] padding_after  Padding applied after the last element.
   */
  static int get_output_size(
    int dim_size, unsigned int padding_before, unsigned int padding_after
  );

  int output_size(
    int dim_size, unsigned int padding_before, unsigned int padding_after
  ) const override;

  /* Determine how much memory is required to store the packed weights and
   * biases.
   */
  size_t get_packed_params_size(void) const override;

  /* Set the buffer for the packed weights and biases, and perform the
   * packing.
   */
  void set_packed_params_buffer(void *buffer) override;

  void pack_params(const void *weights, const void *biases=nullptr) const override;

  void pack_params(
    void *buffer,
    const void *weights,
    const void *biases=nullptr
  ) const override;

  void pack_params(
    void *buffer,
    const void *weights,
    unsigned int weight_row_stride,
    unsigned int weight_col_stride,
    const void *biases=nullptr
  ) const override;

  /** Query the amount of working space required.
   *
   * @param[in] n_threads The largest number of threads which will be used to
   *                      execute the kernel.
   */
  size_t get_working_space_size(unsigned int n_threads=1) const override;

  /** Set the working space buffer.
   */
  void set_working_space(void *buffer) override;

  /** Get the window of work to be performed by an instance of the operator.
   */
  unsigned int get_window(void) const override;

  /** Perform a portion of the work associated with the operator.
   *
   * Will perform the window of work described by $[start, stop)$.
   *
   * @param[in] start    Start of the window of work to perform.
   * @param[in] stop     End of the work to perform.
   * @param[in] threadid ID of the thread performing the work.
   */
  void run(
    unsigned int start,
    unsigned int stop,
    unsigned int threadid=0
  ) override;

 protected:
  /** Get the value to use to pad the tensor.
   */
  TIn _input_padding_value(void) const;

  /** Implementation of the parameter packing.
   */
  void _pack_params(
    void *buffer,
    const void *weights,
    unsigned int weight_row_stride,
    unsigned int weight_col_stride,
    const void *biases=nullptr
  ) const;

  /** Process a tile-row of the tensors.
   */
  void process_tile_row(
    unsigned int threadid,
    int n_channels,
    const void* packed_params,
    const InputType* inptr,
    OutputType* outptr,
    int row_pad_in_top,
    int row_pad_in_left,
    int row_pad_in_bottom,
    int row_pad_out_bottom,
    int n_tiles,
    int n_input_cols,
    int n_output_cols
  );

  /** Process a single tile of the tensor.
   *
   * This method will apply input/output padding (if required) and call the
   * depthwise tile implementation.
   */
  void process_tile(
    unsigned int threadid,
    int n_channels,
    const void* packed_params,
    const InputType* inptr,
    OutputType* outptr,
    int pad_in_top,
    int pad_in_left,
    int pad_in_bottom,
    int pad_in_right,
    int pad_out_bottom,
    int pad_out_right
  );

  /** Perform depthwise convolution on a single tile.
   *
   * Strided variant: the tile is addressed through a base pointer plus
   * row/column strides. Dispatched to the `Derived` implementation.
   */
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const InputType* inptr,
    unsigned int in_row_stride,
    unsigned int in_col_stride,
    OutputType* outptr,
    unsigned int out_row_stride,
    unsigned int out_col_stride
  );

  /** Perform depthwise convolution on a single tile.
   *
   * Gather/scatter variant: every input and output element of the tile is
   * addressed through its own pointer (used when padding requires
   * redirecting individual elements).
   */
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const InputType* inptrs[inner_tile_rows][inner_tile_cols],
    OutputType* outptrs[output_tile_rows][output_tile_cols]
  );

  int n_channels(void) const;

 private:
  // Member variables of instances of a convolution engine.
  const InputType* _input;
  OutputType* _output;
  void* _packed_parameters;
  void* _working_space;  // Per-thread working space
  const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
            _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
  const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
  const nck::ActivationFunction _activation;

  // Stride information for a convolution instance
  int _input_col_stride, _input_row_stride, _input_batch_stride;
  int _output_col_stride, _output_row_stride, _output_batch_stride;

  // Methods for getting access to working space
  size_t _get_input_working_space_size(void) const;
  size_t _get_output_working_space_size(void) const;

  void *_get_input_working_space(unsigned int threadid) const;
  void *_get_output_working_space(unsigned int threadid) const;
};

/** Generic depthwise convolution engine.
 *
 * Supplies the `execute_tile` implementations required by
 * DepthwiseConvolutionBase for arbitrary input/bias/output types;
 * type-specific specializations below override this for float and
 * float16_t.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename TIn, typename TBias, typename TOut
>
class DepthwiseConvolution : public DepthwiseConvolutionBase<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  TIn, TBias, TOut,
  DepthwiseConvolution<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    TIn, TBias, TOut
  >
>
{
  using Base = DepthwiseConvolutionBase<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    TIn, TBias, TOut,
    DepthwiseConvolution<
      OutputTileRows, OutputTileCols,
      KernelRows, KernelCols,
      StrideRows, StrideCols,
      TIn, TBias, TOut
    > >;
  friend Base;  // Base dispatches to the protected execute_tile methods
  using InputType = typename Base::InputType;
  using OutputType = typename Base::OutputType;

 public:
  using Base::DepthwiseConvolutionBase;

 protected:
  // Strided tile implementation (see DepthwiseConvolutionBase::execute_tile).
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const TIn* inptr,
    unsigned int in_row_stride,
    unsigned int in_col_stride,
    TOut* outptr,
    unsigned int out_row_stride,
    unsigned int out_col_stride
  );

  // Per-element pointer tile implementation (used for padded tiles).
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
    OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
  );
};

/** Depthwise convolution engine specialized for fp32 tensors. */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
class DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  float, float, float
> : public DepthwiseConvolutionBase<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  float, float, float,
  DepthwiseConvolution<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    float, float, float
  >
>
{
  using Base = DepthwiseConvolutionBase<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    float, float, float,
    DepthwiseConvolution<
      OutputTileRows, OutputTileCols,
      KernelRows, KernelCols,
      StrideRows, StrideCols,
      float, float, float
    > >;
  friend Base;  // Base dispatches to the protected execute_tile methods
  using InputType = typename Base::InputType;
  using OutputType = typename Base::OutputType;

 public:
  // See the DepthwiseConvolutionBase constructors for parameter documentation.
  DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    nck::ActivationFunction activation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    int n_output_rows, int n_output_cols,
    nck::ActivationFunction activation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

 protected:
  // Strided tile implementation (see DepthwiseConvolutionBase::execute_tile).
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const float* inptr,
    unsigned int in_row_stride,
    unsigned int in_col_stride,
    float* outptr,
    unsigned int out_row_stride,
    unsigned int out_col_stride
  );

  // Per-element pointer tile implementation (used for padded tiles).
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
    float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
  );
};

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Depthwise convolution engine specialized for fp16 tensors.
 *
 * Only available when the target supports FP16 vector arithmetic.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
class DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  float16_t, float16_t, float16_t
> : public DepthwiseConvolutionBase<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  float16_t, float16_t, float16_t,
  DepthwiseConvolution<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    float16_t, float16_t, float16_t
  >
>
{
  using Base = DepthwiseConvolutionBase<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    float16_t, float16_t, float16_t,
    DepthwiseConvolution<
      OutputTileRows, OutputTileCols,
      KernelRows, KernelCols,
      StrideRows, StrideCols,
      float16_t, float16_t, float16_t
    > >;
  friend Base;  // Base dispatches to the protected execute_tile methods
  using InputType = typename Base::InputType;
  using OutputType = typename Base::OutputType;

 public:
  // See the DepthwiseConvolutionBase constructors for parameter documentation.
  DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    nck::ActivationFunction activation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    int n_output_rows, int n_output_cols,
    nck::ActivationFunction activation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

 protected:
  // Strided tile implementation (see DepthwiseConvolutionBase::execute_tile).
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const float16_t* inptr,
    unsigned int in_row_stride,
    unsigned int in_col_stride,
    float16_t* outptr,
    unsigned int out_row_stride,
    unsigned int out_col_stride
  );

  // Per-element pointer tile implementation (used for padded tiles).
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
    float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
  );
};
#endif  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

}  // namespace depthwise