/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H
#define ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"

#include <limits>

/** This file contains all available output stages for GEMMLowp on OpenCL.
 *
 * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
 * and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
 *
 * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
 */
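
/** Reference sketch (not part of this API): the ScaleByFixedPoint stages declared below
 * implement the same arithmetic as gemmlowp's reference code. A minimal scalar version,
 * assuming the round-to-nearest behaviour documented in the class comments:
 *
 * @code{.cpp}
 * #include <cassert>
 * #include <cstdint>
 * #include <limits>
 *
 * // FixedPointMul(a, b): nearest integer to (a * b) / 2^31, saturating the single
 * // overflow case (a == b == INT32_MIN).
 * inline int32_t fixed_point_mul(int32_t a, int32_t b)
 * {
 *     const bool    overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
 *     const int64_t ab_64    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
 *     const int32_t nudge    = (ab_64 >= 0) ? (1 << 30) : (1 - (1 << 30));
 *     const int32_t high32   = static_cast<int32_t>((ab_64 + nudge) / (int64_t(1) << 31));
 *     return overflow ? std::numeric_limits<int32_t>::max() : high32;
 * }
 *
 * // Arithmetic shift right that rounds to nearest (ties away from zero); this is the
 * // rounding form of ">> result_shift" used by gemmlowp's reference implementation.
 * inline int32_t rounding_shift_right(int32_t x, int exponent)
 * {
 *     assert(exponent >= 0 && exponent <= 31);
 *     const int32_t mask      = static_cast<int32_t>((int64_t(1) << exponent) - 1);
 *     const int32_t remainder = x & mask;
 *     const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
 *     return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
 * }
 * @endcode
 */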

namespace arm_compute
{
class CLCompileContext;
class ITensor;
class ICLTensor;
class ITensorInfo;
struct GEMMLowpOutputStageInfo;

/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
 *
 * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
 *
 * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
 *
 * The final result is:
 *
 * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
 *
 * where FixedPointMul(x, y) is the nearest integer to the following
 * mathematical expression, evaluated without overflow or intermediate rounding:
 *
 * (x * y) / 2^31
 *
 * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
 *
 * In case the bias tensor is provided, the final result is:
 *
 * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
 *
 * This function calls the following OpenCL kernels:
 *
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 *
 * @note The function also accepts two optional input arguments (min and max) that can be used to implement "rectified linear unit" activation functions
 *       after the result is shifted right by result_shift
 */
class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction
{
public:
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  input                        Input tensor. Data type supported: S32
     * @param[in]  bias                         Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output                       Output tensor. Data type supported: QASYMM8
     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
     * @param[in]  result_offset_after_shift    Offset to be applied to the result before converting it back to QASYMM8
     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     */
    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  compile_context              The compile context to be used.
     * @param[in]  input                        Input tensor. Data type supported: S32
     * @param[in]  bias                         Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output                       Output tensor. Data type supported: QASYMM8
     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
     * @param[in]  result_offset_after_shift    Offset to be applied to the result before converting it back to QASYMM8
     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     */
    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
                   int result_offset_after_shift,
                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
     *
     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
     * @param[in] bias   Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                   Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[in] output Output tensor. Data type supported: QASYMM8
     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
};
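
/** Usage sketch (illustrative, not part of the API): requantize the S32 result of
 * @ref CLGEMMLowpMatrixMultiplyCore down to QASYMM8 with a fused clamp. Tensor
 * allocation and the preceding GEMM are assumed to be configured elsewhere, and the
 * multiplier/shift pair would typically be derived from the input, weight and output
 * quantization scales (see the decomposition sketch further below).
 *
 * @code{.cpp}
 * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint output_stage;
 * // mm_result: S32 tensor produced by the GEMM core; bias: 1D S32 tensor of shape [OFM]
 * output_stage.configure(&mm_result, &bias, &output_qasymm8,
 *                        result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
 *                        0, 255); // optional clamp; min at the output zero point (assumed 0 here) gives a fused ReLU
 * output_stage.run();
 * @endcode
 */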

/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on OpenCL.
 *
 * CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
 *
 * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
 *
 * The final result is:
 *
 * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
 *
 * where FixedPointMul(x, y) is the nearest integer to the following
 * mathematical expression, evaluated without overflow or intermediate rounding:
 *
 * (x * y) / 2^31
 *
 * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
 *
 * In case the bias tensor is provided, the final result is:
 *
 * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
 *
 * This function calls the following OpenCL kernels:
 *
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 *
 * @note The function also accepts two optional input arguments (min and max) that can be used to implement "rectified linear unit" activation functions
 *       after the result is shifted right by result_shift
 */
class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunction
{
public:
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  input                        Input tensor. Data type supported: S32
     * @param[in]  bias                         Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output                       Output tensor. Data type supported: QASYMM8_SIGNED
     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
     * @param[in]  result_offset_after_shift    Offset to be applied to the result before converting it back to QASYMM8_SIGNED
     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED.
     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     */
    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  compile_context              The compile context to be used.
     * @param[in]  input                        Input tensor. Data type supported: S32
     * @param[in]  bias                         Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output                       Output tensor. Data type supported: QASYMM8_SIGNED
     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
     * @param[in]  result_offset_after_shift    Offset to be applied to the result before converting it back to QASYMM8_SIGNED
     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED.
     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     */
    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
                   int result_offset_after_shift,
                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
     *
     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
     * @param[in] bias   Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                   Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED.
     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
};
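
/** Both fixed-point stages above (and the QSYMM16 stage below) take an integer
 * (multiplier, shift) pair rather than a floating-point rescale factor. A common way to
 * derive that pair from a real multiplier in (0, 1) is the frexp-based decomposition used
 * by gemmlowp and TFLite; a hypothetical helper, shown for illustration only:
 *
 * @code{.cpp}
 * #include <cmath>
 * #include <cstdint>
 *
 * inline void decompose_multiplier(double real_multiplier, int32_t &quantized_multiplier, int &right_shift)
 * {
 *     int          exponent    = 0;
 *     const double significand = std::frexp(real_multiplier, &exponent); // significand in [0.5, 1)
 *     right_shift              = -exponent;                              // >= 0 for real_multiplier < 1
 *     int64_t q = static_cast<int64_t>(std::llround(significand * (int64_t(1) << 31)));
 *     if(q == (int64_t(1) << 31)) // rounding can land exactly on 2^31
 *     {
 *         q /= 2;
 *         --right_shift;
 *     }
 *     quantized_multiplier = static_cast<int32_t>(q);
 * }
 * @endcode
 */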

/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL.
 *
 * CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
 *
 * result_fixedpoint_multiplier, result_shift
 *
 * The final result is:
 *
 * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
 *
 * where FixedPointMul(x, y) is the nearest integer to the following
 * mathematical expression, evaluated without overflow or intermediate rounding:
 *
 * (x * y) / 2^31
 *
 * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
 *
 * In case the bias tensor is provided, the final result is:
 *
 * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift)
 *
 * This function calls the following OpenCL kernels:
 *
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 *
 * @note The function also accepts two optional input arguments (min and max) that can be used to implement "rectified linear unit" activation functions
 *       after the result is shifted right by result_shift
 */
class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunction
{
public:
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  input                        Input tensor. Data type supported: S32
     * @param[in]  bias                         Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output                       Output tensor. Data type supported: QSYMM16
     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix
     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     */
    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
                   int max = std::numeric_limits<int32_t>::max());
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  compile_context              The compile context to be used.
     * @param[in]  input                        Input tensor. Data type supported: S32
     * @param[in]  bias                         Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                                          Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output                       Output tensor. Data type supported: QSYMM16
     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix
     * @param[in]  result_shift                 Number of bits to shift right the result after the fixed point multiplication
     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     */
    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
     *
     * @param[in] input  Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
     * @param[in] bias   Biases tensor info. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                   Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[in] output Output tensor info. Data type supported: QSYMM16
     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
};
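
/** Usage sketch (illustrative): the generic CLGEMMLowpOutputStage declared below selects
 * one of the quantize-down kernels from a GEMMLowpOutputStageInfo descriptor. Field names
 * are assumed to follow the GEMMLowpOutputStageInfo struct from arm_compute/core/Types.h
 * of this release; tensors are assumed to be configured elsewhere.
 *
 * @code{.cpp}
 * GEMMLowpOutputStageInfo info{};
 * info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
 * info.gemmlowp_multiplier = result_fixedpoint_multiplier; // e.g. from decompose_multiplier() above
 * info.gemmlowp_shift      = result_shift;
 * info.gemmlowp_offset     = result_offset_after_shift;
 * info.gemmlowp_min_bound  = 0;   // optional clamp, as for the dedicated functions above
 * info.gemmlowp_max_bound  = 255;
 * info.output_data_type    = DataType::QASYMM8;
 *
 * CLGEMMLowpOutputStage output_stage;
 * output_stage.configure(&mm_result, &bias, &output_qasymm8, info);
 * output_stage.run();
 * @endcode
 */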

/** Basic function to execute GEMMLowpQuantizeDown kernels on OpenCL.
 *
 * This function calls the following OpenCL kernels:
 *
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 */
class CLGEMMLowpOutputStage : public ICLSimpleFunction
{
public:
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  input  Input tensor. Data type supported: S32
     * @param[in]  bias   Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                    Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
     * @param[in]  info   GEMMLowp output stage metadata.
     */
    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
    /** Initialise the kernel's inputs and output
     *
     * @param[in]  compile_context The compile context to be used.
     * @param[in]  input           Input tensor. Data type supported: S32
     * @param[in]  bias            Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                             Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[out] output          Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
     * @param[in]  info            GEMMLowp output stage metadata.
     */
    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOutputStage
     *
     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
     * @param[in] bias   Biases tensor. Only shared biases are supported, and it can be nullptr if the bias addition is not required.
     *                   Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
     * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
     * @param[in] info   GEMMLowp output stage metadata.
     *
     * @return a status
     */
    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */