• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
25 #define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
26 
27 #include "arm_compute/core/TensorInfo.h"
28 #include "src/core/common/Macros.h"
29 #include "src/cpu/ICpuOperator.h"
30 
31 #include <memory>
32 
33 namespace arm_compute
34 {
35 namespace cpu
36 {
// Forward declarations of the kernel/operator types that this header only ever
// names through std::unique_ptr members. Forward-declaring them (instead of
// including their headers) keeps compile-time coupling low for clients of this
// header; the .cpp includes the full definitions.
namespace kernels
{
class CpuGemmInterleave4x4Kernel;
class CpuGemmLowpMatrixMultiplyKernel;
class CpuGemmLowpOffsetContributionKernel;
class CpuGemmLowpOffsetContributionOutputStageKernel;
class CpuGemmLowpMatrixAReductionKernel;
class CpuGemmLowpMatrixBReductionKernel;
class CpuGemmTranspose1xWKernel;
class CpuConvertQuantizedSignednessKernel;
} // namespace kernels
class CpuGemmAssemblyDispatch;
class CpuActivation;
50 
51 /** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
52  *
53  *  -# @ref kernels::CpuGemmInterleave4x4Kernel
54  *  -# @ref kernels::CpuGemmTranspose1xWKernel
55  *  -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel
56  *  -# @ref kernels::CpuGemmLowpOffsetContributionKernel
57  *  -# @ref CpuActivation
58  *
59  * otherwise if the DOT product instruction is available:
60  *
61  *  -# @ref kernels::CpuGemmLowpOffsetContributionKernel
62  *
63 */
class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator
{
public:
    /** Constructor */
    CpuGemmLowpMatrixMultiplyCore();
    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore);
    /** Destructor
     *
     * Declared here but defined out-of-line: the std::unique_ptr members point to
     * forward-declared kernel types, which must be complete wherever the destructor
     * is instantiated, so an inline/defaulted destructor would not compile.
     */
    ~CpuGemmLowpMatrixMultiplyCore();
    /** Initialise the kernel's inputs, output
     *
     * Valid data layouts:
     * - NHWC
     * - NCHW
     *
     * Valid data type configurations:
     * |src0           |src1               |src2     |dst            |
     * |:--------------|:------------------|:--------|:--------------|
     * |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
     * |QASYMM8        |QSYMM8             |S32      |QASYMM8        |
     * |QASYMM8        |QASYMM8            |S32      |S32            |
     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |S32            |
     * |QASYMM8        |QSYMM8             |S32      |S32            |
     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
     * |QASYMM8_SIGNED |QSYMM8             |S32      |QASYMM8_SIGNED |
     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |S32            |
     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |S32            |
     * |QASYMM8_SIGNED |QSYMM8             |S32      |S32            |
     *
     * @note GEMM_LOWP:  low precision GEMM kernel
     *  This kernel performs the following computations:
     *
     *  -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
     *  -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
     *  -# Compute the matrix product of the resulting a * b in int32.
     *
     * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
     *
     * @param[in]  a         First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
     * @param[in]  b         Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
     * @param[in]  c         Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
     * @param[out] dst       Output tensor info. Data type supported: S32/QASYMM8/QASYMM8_SIGNED
     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
     *                       if the reshape of matrix B should be executed only for the first run
     */
    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
    /** Static function to check if given info will lead to a valid configuration
     *
     * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());

    // Inherited methods overridden:
    void run(ITensorPack &tensors) override;
    void prepare(ITensorPack &tensors) override;
    experimental::MemoryRequirements workspace() const override;

private:
    /** Slot indices for the auxiliary (intermediate) tensors.
     *
     * NOTE(review): the names mirror the TensorInfo members below; presumably these
     * values index the entries of _aux_mem / the workspace tensor pack — confirm
     * against the .cpp, which owns the actual bookkeeping.
     */
    enum AuxTensorIdx
    {
        AsmGemmWorkspace = 0,
        Pretranspose,
        VectorSumCol,
        VectorSumRow,
        TmpA,
        TmpB,
        MMResultS32,
        SignedA,
        SignedOutput,
        Count // Number of auxiliary tensor slots; keep last
    };

    // Sub-operators/kernels this function can dispatch to. Which of them actually
    // run depends on the configuration (see the class-level comment above for the
    // DOT-product vs non-DOT-product kernel lists).
    std::unique_ptr<CpuGemmAssemblyDispatch>                                 _asm_glue;
    std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel>                _mm_kernel;
    std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
    std::unique_ptr<kernels::CpuGemmTranspose1xWKernel>                      _mtx_b_reshape_kernel;
    std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
    std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
    std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel>            _offset_contribution_kernel;
    std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
    std::unique_ptr<CpuActivation>                                           _activation_func;
    std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel>            _convert_to_signed_asymm;
    std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel>            _convert_from_signed_asymm;

    // Metadata describing the intermediate tensors named by AuxTensorIdx.
    TensorInfo _vector_sum_col;
    TensorInfo _vector_sum_row;
    TensorInfo _tmp_a;
    TensorInfo _tmp_b;
    TensorInfo _mm_result_s32;
    TensorInfo _signed_a;
    TensorInfo _signed_output;
    int32_t    _a_offset; // Quantization offset added to matrix A values (see configure() notes)
    int32_t    _b_offset; // Quantization offset added to matrix B values (see configure() notes)

    // Execution-path flags; presumably fixed at configure() time and consumed by
    // run()/prepare() — the definitions live in the .cpp.
    bool                             _run_vector_matrix_multiplication;
    bool                             _assembly_path;
    bool                             _fused_assembly_path;
    bool                             _reshape_b_only_on_first_run;
    bool                             _is_prepared;
    bool                             _fuse_output_stage;
    bool                             _run_activation;
    bool                             _flip_signedness;
    GEMMInfo                         _gemm_info;
    experimental::MemoryRequirements _aux_mem{};
};
172 } // namespace cpu
173 } // namespace arm_compute
174 #endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
175