android-13.0.0_r83/s

/*
 * Copyright (c) 2017-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "arm_gemm.hpp"

#include <cstddef>
#include <utility>

namespace winograd
{

/**
 * Abstract interface shared by the Winograd transform operators.
 *
 * An implementation reports how much working space it needs, is handed that
 * working space, reports the window of work it can perform and is then asked
 * to run over (parts of) that window.
 */
class ITransform
{
  public:
    virtual ~ITransform() = default;

    /**
     * Query the amount of working space the transformation needs.
     *
     * The working space is only needed while the transformation is actually
     * executing, so the same memory may be put to other uses at any other
     * time.
     *
     * @param nthreads Upper bound on the number of threads which will execute the transform.
     * @return Required size of the working space, in bytes.
     */
    virtual size_t get_working_space_size(unsigned int nthreads = 1) const = 0;

    /**
     * Provide the working space which the transformation should use.
     *
     * The working space is only needed while the transformation is actually
     * executing, so the same memory may be put to other uses at any other
     * time.
     *
     * @param buffer Pointer to the working space.
     */
    virtual void set_working_space(void *buffer) = 0;

    /**
     * Report the window of work which this operator can perform.
     */
    virtual unsigned int get_window() const = 0;

    /**
     * Execute the transform over part of its window of work.
     *
     * @param start    Start of the range of the window to process.
     * @param stop     End of the range of the window to process.
     *                 NOTE(review): whether this bound is inclusive is not visible here - confirm.
     * @param threadid Identifier of the executing thread.
     */
    virtual void run(unsigned int start, unsigned int stop, unsigned int threadid = 0) = 0;
};

/**
 * Interface of operators which transform an (NHWC-ordered) input tensor into
 * a set of matrices.
 */
class IInputTransform : public ITransform
{
  public:
    virtual ~IInputTransform() = default;

    /**
     * Select the (NHWC-ordered) tensor which is to be transformed.
     * @param input Pointer to the tensor.
     */
    virtual void set_input_tensor(const void *input) = 0;

    /**
     * Select the (NHWC-ordered) tensor which is to be transformed.
     * @param input      Pointer to the tensor.
     * @param col_stride Distance between columns of the tensor, in elements (not bytes).
     */
    virtual void set_input_tensor(const void *input, int col_stride) = 0;

    /**
     * Select the (NHWC-ordered) tensor which is to be transformed.
     * @param input      Pointer to the tensor.
     * @param row_stride Distance between rows of the tensor, in elements (not bytes).
     * @param col_stride Distance between columns of the tensor, in elements (not bytes).
     */
    virtual void set_input_tensor(const void *input, int row_stride, int col_stride) = 0;

    /**
     * Select the (NHWC-ordered) tensor which is to be transformed.
     * @param input        Pointer to the tensor.
     * @param batch_stride Distance between batches of the tensor, in elements (not bytes).
     * @param row_stride   Distance between rows of the tensor, in elements (not bytes).
     * @param col_stride   Distance between columns of the tensor, in elements (not bytes).
     */
    virtual void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) = 0;

    /**
     * Select the matrices into which the transform writes its result.
     * @param matrices            Pointer to the start of the first matrix of the transformed input.
     * @param inter_matrix_stride Distance (in elements) between consecutive matrices.
     * @param matrix_row_stride   Distance (in elements) between the rows of a single matrix.
     */
    virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
};

/**
 * Interface of operators which transform a set of matrices into an output
 * tensor, optionally taking a bias tensor.
 */
class IOutputTransform : public ITransform
{
  public:
    virtual ~IOutputTransform() = default;

    /**
     * Select the matrices which the transform reads.
     * @param matrices            Pointer to the start of the first matrix forming the input to the transform.
     * @param inter_matrix_stride Distance (in elements) between consecutive matrices.
     * @param matrix_row_stride   Distance (in elements) between the rows of a single matrix.
     */
    virtual void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;

    /**
     * Select the bias tensor. For no bias this call can be skipped, or made
     * with nullptr.
     * @param bias Pointer to the bias tensor, or nullptr.
     */
    virtual void set_bias(const void *bias = nullptr) = 0;

    /**
     * Select the tensor into which the transform writes its output.
     * @param output Pointer to the output tensor.
     */
    virtual void set_output_tensor(void *output) = 0;

    /**
     * Select the tensor into which the transform writes its output.
     * @param output     Pointer to the output tensor.
     * @param col_stride Distance between columns of the tensor, in elements (not bytes).
     */
    virtual void set_output_tensor(void *output, int col_stride) = 0;

    /**
     * Select the tensor into which the transform writes its output.
     * @param output     Pointer to the output tensor.
     * @param row_stride Distance between rows of the tensor, in elements (not bytes).
     * @param col_stride Distance between columns of the tensor, in elements (not bytes).
     */
    virtual void set_output_tensor(void *output, int row_stride, int col_stride) = 0;

    /**
     * Select the tensor into which the transform writes its output.
     * @param output       Pointer to the output tensor.
     * @param batch_stride Distance between batches of the tensor, in elements (not bytes).
     * @param row_stride   Distance between rows of the tensor, in elements (not bytes).
     * @param col_stride   Distance between columns of the tensor, in elements (not bytes).
     */
    virtual void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) = 0;
};

/**
 * Interface of operators which transform a weight tensor into a set of
 * matrices.
 */
class IWeightTransform : public ITransform
{
  public:
    virtual ~IWeightTransform() = default;

    /**
     * Select the weight tensor which the transform reads.
     * @param weights Pointer to the weight tensor.
     */
    virtual void set_weight_tensor(const void *weights) = 0;

    /**
     * Select the matrices into which the transform writes its result.
     * @param matrices            Pointer to the start of the first matrix written by the transform.
     * @param inter_matrix_stride Distance (in elements) between consecutive matrices.
     * @param matrix_row_stride   Distance (in elements) between the rows of a single matrix.
     */
    virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
};

/**
 * Tag type used as the `Roots` template parameter of the transform classes.
 * `Integers` is the only value defined here.
 */
enum class WinogradRoots
{
    Integers,
};

/**
 * Input transform operator, fixed on a given problem size.
 *
 * @tparam InnerTileRows Number of rows in the inner tile.
 * @tparam InnerTileCols Number of columns in the inner tile.
 * @tparam TIn           Element type of the tensor read by the transform.
 * @tparam TOut          Element type of the matrices written by the transform.
 * @tparam Roots         Selects the roots used by the transform.
 */
template <int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots>
class InputTransform : public IInputTransform
{
  public:
    /** Create an InputTransform operator fixed on a given problem and set of
     * pointers.
     */
    InputTransform(
        int kernel_rows,     /**< Number of rows in the kernel. */
        int kernel_cols,     /**< Number of columns in the kernel. */
        int n_batches,       /**< Number of batches in input tensor. */
        int n_rows,          /**< Number of rows in input tensor. */
        int n_cols,          /**< Number of columns in input tensor. */
        int n_channels,      /**< Number of channels in input tensor. */
        int padding_top,     /**< Padding to apply to the top of the image. */
        int padding_left,    /**< Padding to apply to the left of the image. */
        int padding_bottom,  /**< Padding to apply to the bottom of the image. */
        int padding_right    /**< Padding to apply to the right of the image. */
    );

    /* The operator is not copyable. The copy operations are declared with
     * their canonical signatures so that an attempted copy from any value
     * category resolves to a deleted function.
     */
    InputTransform(const InputTransform&) = delete;
    InputTransform& operator=(const InputTransform&) = delete;

    /** Set pointers to the input tensor read by the transform. Strides are
     * those described by IInputTransform::set_input_tensor.
     */
    void set_input_tensor(const void *input) override;
    void set_input_tensor(const void *input, int col_stride) override;
    void set_input_tensor(const void *input, int row_stride, int col_stride) override;
    void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;

    /** Set pointers to the matrices written by the transform. */
    void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;

    /** Get the working space required to perform the transformation. */
    size_t get_working_space_size(unsigned int nthreads=1) const override;

    /** Set the working space to be used by the transformation. */
    void set_working_space(void *buffer) override;

    /** Get the window of work a given operator can perform. */
    unsigned int get_window() const override;
    static constexpr unsigned int WINDOW_BLOCK = 16;  // Base size of window

    /** Perform work upon a window of the input. */
    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;

  protected:
    /* Problem dimensions (see the constructor arguments of the same names). */
    const int _n_batches, _n_rows, _n_cols, _n_channels;

  private:
    /** Transform a tile which requires no padding. */
    void transform_unpadded_tile(
      unsigned int threadid,
      int n_channels,
      TOut *outptr,
      const TIn *inptr
    );

    /** Transform a tile to which the given amounts of padding apply. */
    void transform_padded_tile(
      unsigned int threadid,
      int n_channels,
      TOut *outptr,
      const TIn *inptr,
      int padding_top,
      int padding_left,
      int padding_bottom,
      int padding_right
    );

    /** Tile implementation. */
    static void transform_tile(
      int n_channels,         /**< [in] Number of channels in the tensor. */
      const TIn* inptr_base,  /**< [in] Pointer to the base of the input tile. */
      int input_row_stride,   /**< [in] Stride between rows of the input tensor. */
      int input_col_stride,   /**< [in] Stride between columns of the input tensor. */
      TOut* mptr_base,        /**< [out] Base pointer to transformed input matrices. */
      int matrix_stride       /**< [in] Stride between matrices in the input space. */
    );

    /** Get the working space for a thread. */
    void * get_working_space(unsigned int threadid) const;

    const TIn* _inptr;  // Tensor read by the transform
    TOut* _outptr;      // Matrices written by the transform

    const int _overlap_rows, _overlap_cols;
    const int _padding_top, _padding_left, _padding_bottom, _padding_right;
    const int _tiles_M, _tiles_N;
    int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
    int _in_col_stride, _in_row_stride, _in_batch_stride;

    const int _working_space_col_stride, _working_space_row_stride;
    TIn *_working_space;
};

/**
 * Partial specialisation of the input transform for an inner tile with a
 * single column. It is built on top of the transform whose inner tile has a
 * single row (`InputTransform<1, InnerTileRows, ...>`).
 */
template <int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
class InputTransform<InnerTileRows, 1, TIn, TOut, Roots> :
  public InputTransform<1, InnerTileRows, TIn, TOut, Roots>
{
  using Base = InputTransform<1, InnerTileRows, TIn, TOut, Roots>;

  public:
    /** Create the operator for a given problem. */
    InputTransform(
        int kernel_rows,     /**< Rows in the kernel. */
        int kernel_cols,     /**< Columns in the kernel. */
        int n_batches,       /**< Batches in the input tensor. */
        int n_rows,          /**< Rows in the input tensor. */
        int n_cols,          /**< Columns in the input tensor. */
        int n_channels,      /**< Channels in the input tensor. */
        int padding_top,     /**< Padding applied to the top of the image. */
        int padding_left,    /**< Padding applied to the left of the image. */
        int padding_bottom,  /**< Padding applied to the bottom of the image. */
        int padding_right    /**< Padding applied to the right of the image. */
    );

    /**
     * Select the input tensor read by the transform.
     * NOTE(review): these overloads replace the ones of the base transform;
     * how they differ is determined by definitions not visible in this header.
     */
    void set_input_tensor(const void *input) override;
    void set_input_tensor(const void *input, int col_stride) override;
    void set_input_tensor(const void *input, int row_stride, int col_stride) override;
    void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
};

/**
 * Output transform operator, fixed on a given problem size.
 *
 * @tparam KernelRows    Number of rows in the kernel.
 * @tparam KernelCols    Number of columns in the kernel.
 * @tparam InnerTileRows Number of rows in the inner tile.
 * @tparam InnerTileCols Number of columns in the inner tile.
 * @tparam TIn           Element type of the matrices read by the transform.
 * @tparam TOut          Element type of the bias and output tensors.
 * @tparam Roots         Selects the roots used by the transform.
 */
template <
  int KernelRows, int KernelCols,
  int InnerTileRows, int InnerTileCols,
  typename TIn, typename TOut,
  WinogradRoots Roots
>
class OutputTransform : public IOutputTransform
{
  public:
    /** Create an OutputTransform operator fixed on a given problem. */
    OutputTransform(
      int n_batches,  /**< Number of batches in output tensor. */
      int n_rows,     /**< Number of rows in output tensor. */
      int n_cols,     /**< Number of columns in output tensor. */
      int n_channels, /**< Number of channels in output tensor. */
      const arm_gemm::Activation &activation  /**< Activation associated with the output. */
    );

    /* The operator is not copyable. The copy operations are declared with
     * their canonical signatures so that an attempted copy from any value
     * category resolves to a deleted function.
     */
    OutputTransform(const OutputTransform&) = delete;
    OutputTransform& operator=(const OutputTransform&) = delete;

    /** Set pointers to the matrices read by the transform. */
    void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) override;

    /** Set pointer to the bias tensor (can be ignored or called with nullptr for no bias). */
    void set_bias(const void *bias=nullptr) override;

    /** Set pointers to the output tensor written by the transform. Strides
     * are those described by IOutputTransform::set_output_tensor.
     */
    void set_output_tensor(void *output) override;
    void set_output_tensor(void *output, int col_stride) override;
    void set_output_tensor(void *output, int row_stride, int col_stride) override;
    void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;

    /** Get the working space required to perform the transformation. */
    size_t get_working_space_size(unsigned int nthreads=1) const override;

    /** Set the working space to be used by the transformation. */
    void set_working_space(void *buffer) override;

    /** Get the window of work a given operator can perform. */
    unsigned int get_window() const override;
    static constexpr unsigned int WINDOW_BLOCK = 16;  // Base size of window

    /** Perform work upon a window of the input. */
    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;

  protected:
    static constexpr int inner_tile_rows = InnerTileRows;
    static constexpr int inner_tile_cols = InnerTileCols;
    // An inner tile of size I convolved with a kernel of size K yields I - K + 1 outputs.
    static constexpr int output_tile_rows = InnerTileRows - KernelRows + 1;
    static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1;

    /* Problem dimensions (see the constructor arguments of the same names). */
    const int _n_batches, _n_rows, _n_cols, _n_channels;
    // NOTE(review): presumably the output bounds derived from the activation - confirm.
    const TOut _output_min, _output_max;

  private:
    /** Transform a tile whose output does not need to be cropped. */
    void transform_uncropped_tile(
      unsigned int threadid,
      int n_channels,
      TOut *outptr,
      const TIn *inptr,
      const TOut *biases
    );

    /** Transform a tile whose output is cropped at the bottom and/or right. */
    void transform_cropped_tile(
      unsigned int threadid,
      int n_channels,
      TOut *outptr,
      const TIn *inptr,
      const TOut *biases,
      int pad_bottom,
      int pad_right
    );

    /** Implementation of the tile transformation method. */
    static void transform_tile(
      int n_channels,
      const TIn* matrix_base,
      int matrix_stride,
      const TOut* biases,
      TOut* output,
      int output_row_stride,
      int output_col_stride,
      TOut output_min,
      TOut output_max
    );

    /** Get the working space for a thread. */
    void * get_working_space(unsigned int threadid) const;

    const TIn* _matrix_base;  // Matrices read by the transform
    const TOut* _biases;      // Bias tensor
    int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
    TOut* _outptr;            // Output tensor written by the transform
    const int _tiles_M, _tiles_N;
    int _out_col_stride, _out_row_stride, _out_batch_stride;

    const int _working_space_col_stride, _working_space_row_stride;
    TOut *_working_space;
};

/**
 * Partial specialisation of the output transform for a kernel and inner tile
 * with a single column. It is built on top of the transform whose kernel and
 * inner tile have a single row
 * (`OutputTransform<1, KernelRows, 1, InnerTileRows, ...>`).
 */
template <
  int KernelRows,
  int InnerTileRows,
  typename TIn, typename TOut,
  WinogradRoots Roots
>
class OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
  public OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
{
  using Base = OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>;

  public:
    /** Create the operator for a given problem. */
    OutputTransform(
        int n_batches,  /**< Batches in the output tensor. */
        int n_rows,     /**< Rows in the output tensor. */
        int n_cols,     /**< Columns in the output tensor. */
        int n_channels, /**< Channels in the output tensor. */
        const arm_gemm::Activation &activation
    );

    /**
     * Select the output tensor written by the transform.
     * NOTE(review): these overloads replace the ones of the base transform;
     * how they differ is determined by definitions not visible in this header.
     */
    void set_output_tensor(void *output) override;
    void set_output_tensor(void *output, int col_stride) override;
    void set_output_tensor(void *output, int row_stride, int col_stride) override;
    void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
};

/**
 * Weight transform operator, fixed on a given problem size.
 *
 * @tparam KernelRows    Number of rows in the kernel.
 * @tparam KernelCols    Number of columns in the kernel.
 * @tparam InnerTileRows Number of rows in the inner tile.
 * @tparam InnerTileCols Number of columns in the inner tile.
 * @tparam TIn           Element type of the weight tensor read by the transform.
 * @tparam TOut          Element type of the matrices written by the transform.
 * @tparam Roots         Selects the roots used by the transform.
 */
template <
  int KernelRows, int KernelCols,
  int InnerTileRows, int InnerTileCols,
  typename TIn, typename TOut,
  WinogradRoots Roots
>
class WeightTransform : public IWeightTransform
{
  public:
    /** Create a WeightTransform operator fixed on a given problem. */
    WeightTransform(
      int n_output_channels,  /**< Number of output channels in the kernel. */
      int n_input_channels    /**< Number of input channels in the kernel. */
    );

    /* The operator is not copyable. The copy operations are declared with
     * their canonical signatures so that an attempted copy from any value
     * category resolves to a deleted function.
     */
    WeightTransform(const WeightTransform&) = delete;
    WeightTransform& operator=(const WeightTransform&) = delete;

    /** Set pointer to the weight tensor read by the transform. */
    void set_weight_tensor(const void *weights) override;

    /** Set pointer to the matrices written by the transform. */
    void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;

    /** Get the working space required to perform the transformation. */
    size_t get_working_space_size(unsigned int nthreads=1) const override;

    /** Set the working space to be used by the transformation. */
    void set_working_space(void *buffer) override;

    /** Get the window of work a given operator can perform. */
    unsigned int get_window() const override;
    static constexpr unsigned int WINDOW_BLOCK = 16;  // Base size of window

    /** Perform work upon a window of the input. */
    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;

  protected:
    /* Template parameters re-exported as named constants. */
    static const int kernel_rows = KernelRows;
    static const int kernel_cols = KernelCols;
    static const int inner_tile_rows = InnerTileRows;
    static const int inner_tile_cols = InnerTileCols;

  private:
    /** Apply the transform to a tensor. */
    static void execute(
      int n_output_channels,
      int n_input_channels,
      const TIn* input,
      TOut* output,
      int matrix_stride,
      int matrix_row_stride
    );

    const int _n_output_channels, _n_input_channels;
    TOut *_matrices;        // Matrices written by the transform
    int _matrix_stride, _matrix_row_stride;
    const TIn *_weights;    // Weight tensor read by the transform
};

/**
 * Partial specialisation of the weight transform for a kernel and inner tile
 * with a single column. It reuses the transform whose kernel and inner tile
 * have a single row, including its constructors.
 */
template <int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
class WeightTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
  public WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
{
  public:
    /* Inherit the constructors of the single-row transform. */
    using WeightTransform<
      1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots
    >::WeightTransform;
};

template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, WinogradRoots Roots>
class WinogradGEMM
{
  public:
    // Information about the specific Winograd instance
    static constexpr int output_tile_rows = OutputTileRows;
    static constexpr int output_tile_cols = OutputTileCols;
    static constexpr int kernel_rows = KernelRows;
    static constexpr int kernel_cols = KernelCols;
    static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1;
    static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1;
    static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols;

    /** Transform weights from the spatial to the Winograd domain. */
    template <typename TIn, typename TOut>
    using WeightsTransform = WeightTransform<
      KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
      TIn, TOut, Roots
    >;

    /** Transform input feature maps from the spatial to the Winograd domain.
     */
    template <typename TIn, typename TOut>
    using InputTransform = InputTransform<
      inner_tile_rows, inner_tile_cols, TIn, TOut, Roots
    >;

    /** Transform output feature maps from the Winograd to the spatial domain.
     */
    template <typename TIn, typename TOut>
    using OutputTransform = OutputTransform<
      KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
      TIn, TOut, Roots
    >;

    /** Perform a convolution.
     */
    template <typename TOut, typename TIn, typename TInGEMM=TIn, typename TOutGEMM=TOut>
    class Convolution
    {
      public:
        // Information about the typed Winograd instance
        typedef TOut OutputType;
        typedef TOutGEMM GemmOutputType;
        typedef TInGEMM GemmInputType;
        typedef TIn InputType;

        /** Get the output shape of a convolution. */
        static std::pair<unsigned int, unsigned int> get_output_shape(
            const std::pair<unsigned int, unsigned int> input_shape,
            bool padding_same);

        /** Get the memory required to store the kernel transformed into the
         * Winograd domain.
         */
        static size_t get_kernel_storage_size(unsigned int n_input_channels,
                                              unsigned int n_output_channels);

        /** Get the memory required to store the input tensor transformed into
         * the Winograd domain.
         */
        static size_t get_input_storage_size(
            unsigned int n_batches,  // Number of batches
            unsigned int n_rows,     // Number of input rows
            unsigned int n_cols,     // Number of input columns
            unsigned int n_channels, // Number of input channels
            bool padding_same);

        /** Get the memory required to store the output tensor in the Winograd
         * domain.
         */
        static size_t get_output_storage_size(
            unsigned int n_batches, // Number of batches
            unsigned int n_rows,    // Number of output rows
            unsigned int n_cols,    // Number of output columns
            unsigned int n_channels // Number of output channels
            );

        /** Get the memory required to apply a Winograd operator to some input.
         */
        static size_t get_working_space_size(
            unsigned int n_batches,
            unsigned int n_rows,            // Number of input rows
            unsigned int n_cols,            // Number of input columns
            unsigned int n_input_channels,  // Number of input channels
            unsigned int n_output_channels, // Number of output channels
            bool padding_same);

        /* Get the memory required by a single "input" matrix.
         */
        static size_t get_input_matrix_size(
            unsigned int n_batches,  // Number of batches
            unsigned int n_rows,     // Number of input rows
            unsigned int n_cols,     // Number of input columns
            unsigned int n_channels, // Number of input channels
            bool padding_same);

        static int get_input_matrix_stride(
            unsigned int n_batches,  // Number of batches
            unsigned int n_rows,     // Number of input rows
            unsigned int n_cols,     // Number of input columns
            unsigned int n_channels, // Number of input channels
            bool padding_same);

        /* Get the memory required by a single "output" matrix.
         */
        static size_t get_output_matrix_size(
            unsigned int n_batches, // Number of batches
            unsigned int n_rows,    // Number of output rows
            unsigned int n_cols,    // Number of output columns
            unsigned int n_channels // Number of output channels
            );

        static int get_output_matrix_stride(
            unsigned int n_batches, // Number of batches
            unsigned int n_rows,    // Number of output rows
            unsigned int n_cols,    // Number of output columns
            unsigned int n_channels // Number of output channels
            );

        /* Get the memory required by a single "kernel" matrix.
         */
        static size_t get_kernel_matrix_size(unsigned int n_input_channels,
                                             unsigned int n_output_channels);
        static int get_kernel_matrix_stride(unsigned int n_input_channels,
                                            unsigned int n_output_channels);

        static constexpr int M_BLOCK = 4;   /** Size of block used by GEMM. */
        static constexpr int N_BLOCK = 16;  /** Size of block used by GEMM. */
    };
};

}  // namespace winograd