• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
25 #define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
26 
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"

#include <cstdint>
#include <memory>
30 
31 namespace arm_compute
32 {
33 class CLCompileContext;
34 class IMemoryManager;
35 class ICLTensor;
36 class ITensorInfo;
37 class CLDepthConvertLayerKernel;
38 class CLGEMMLowpMatrixMultiplyNativeKernel;
39 class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel;
40 class CLGEMMLowpOffsetContributionKernel;
41 class CLGEMMLowpOffsetContributionOutputStageKernel;
42 class CLGEMMLowpMatrixAReductionKernel;
43 class CLGEMMLowpMatrixBReductionKernel;
44 class CLGEMMReshapeRHSMatrixKernel;
45 
46 /** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
47 class CLGEMMLowpMatrixMultiplyCore : public IFunction
48 {
49 public:
50     /** Constructor */
51     CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
52     /** Prevent instances of this class from being copied (As this class contains pointers) */
53     CLGEMMLowpMatrixMultiplyCore(const CLGEMMLowpMatrixMultiplyCore &) = delete;
54     /** Default move constructor */
55     CLGEMMLowpMatrixMultiplyCore(CLGEMMLowpMatrixMultiplyCore &&) = default;
56     /** Prevent instances of this class from being copied (As this class contains pointers) */
57     CLGEMMLowpMatrixMultiplyCore &operator=(const CLGEMMLowpMatrixMultiplyCore &) = delete;
58     /** Default move assignment operator */
59     CLGEMMLowpMatrixMultiplyCore &operator=(CLGEMMLowpMatrixMultiplyCore &&) = default;
60     /** Default destructor */
61     ~CLGEMMLowpMatrixMultiplyCore();
62     /** Initialise the kernel's inputs, output
63      *
64      * @note GEMMLowp:  low precision GEMM kernel. [A * B + C]
65      *  This kernel performs the following computations:
66      *
67      *  -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
68      *  -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
69      *  -# Compute the matrix product of the resulting a * b in int32.
70      *  -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
71      *
72      * @param[in]  a         First input tensor  (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
73      * @param[in]  b         Second input tensor (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
74      * @param[in]  c         Third input tensor  (Matrix C). It can be a nullptr. Data type supported: S32
75      * @param[out] output    Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
76      * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
77      *                       if the reshape of matrix B should be executed only for the first run
78      */
79     void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
80     /** Initialise the kernel's inputs, output
81      *
82      * @note GEMMLowp:  low precision GEMM kernel. [A * B + C]
83      *  This kernel performs the following computations:
84      *
85      *  -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
86      *  -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
87      *  -# Compute the matrix product of the resulting a * b in int32.
88      *  -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
89      *
90      * @param[in]  compile_context The compile context to be used.
91      * @param[in]  a               First input tensor  (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
92      * @param[in]  b               Second input tensor (Matrix B). Data type supported: same as @p a
93      * @param[in]  c               Third input tensor  (Matrix C). It can be a nullptr. Data type supported: S32
94      * @param[out] output          Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
95      * @param[in]  gemm_info       (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
96      *                       if the reshape of matrix B should be executed only for the first run
97      */
98     void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
99     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
100      *
101      * @param[in] a         First input tensor info (Matrix A). Data type supported: QASYMM8.
102      * @param[in] b         Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
103      * @param[in] c         Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
104      * @param[in] output    Output tensor info. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
105      * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
106      *                      if the reshape of matrix B should be executed only for the first run
107      *
108      * @return a status
109      */
110     static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
111 
112     // Inherited methods overridden:
113     void run() override;
114     void prepare() override;
115 
116 private:
117     MemoryGroup _memory_group;
118 
119     // Kernels used
120     std::unique_ptr<CLDepthConvertLayerKernel>                     _weights_to_qasymm8;
121     std::unique_ptr<CLGEMMLowpMatrixMultiplyNativeKernel>          _mm_native_kernel;
122     std::unique_ptr<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel> _mm_reshaped_only_rhs_kernel;
123     std::unique_ptr<CLGEMMReshapeRHSMatrixKernel>                  _mtx_b_reshape_kernel;
124     std::unique_ptr<CLGEMMLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
125     std::unique_ptr<CLGEMMLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
126     std::unique_ptr<CLGEMMLowpOffsetContributionKernel>            _offset_contribution_kernel;
127     std::unique_ptr<CLGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
128 
129     // Temporary tensors
130     CLTensor _qasymm8_weights;
131     CLTensor _vector_sum_col;
132     CLTensor _vector_sum_row;
133     CLTensor _tmp_b;
134     CLTensor _mm_result_s32;
135     CLTensor _gemm_output_stage_multipliers;
136     CLTensor _gemm_output_stage_shifts;
137 
138     // Tensor pointers
139     const ICLTensor *_matrix_a;
140     const ICLTensor *_original_b;
141     const ICLTensor *_output;
142 
143     int32_t _a_offset;
144     int32_t _b_offset;
145     bool    _is_gemm_reshaped;
146     bool    _reshape_b_only_on_first_run;
147     bool    _is_prepared;
148     bool    _run_output_stage;
149     bool    _convert_to_qasymm8;
150     bool    _run_offset_contribution;
151 };
152 } // namespace arm_compute
153 #endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H */