1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // output.h: processing the 32-bit accumulators output by the unpack 16 // stage, obtaining the final result matrix entries and storing them into 17 // the destination matrix. 18 19 #ifndef GEMMLOWP_INTERNAL_OUTPUT_H_ 20 #define GEMMLOWP_INTERNAL_OUTPUT_H_ 21 22 #include <cmath> 23 #include <tuple> 24 #include <type_traits> 25 26 #include "../fixedpoint/fixedpoint.h" 27 #include "../public/output_stages.h" 28 #include "simd_wrappers.h" 29 30 namespace gemmlowp { 31 32 template <typename OutputStage, typename InputBufferType> 33 struct OutputStageEvalBufferImpl { 34 // This generic template body should never be hit. 35 static_assert( 36 std::is_same<InputBufferType, void>::value, 37 "Unimplemented: missing implementation of this output pipeline stage " 38 "for this data type. This would happen if some architecture-specific " 39 "SIMD back-end (output_$arch.h) were incomplete."); 40 }; 41 42 template <typename OutputStage, typename InputType> 43 struct OutputStageEvalImpl { 44 static constexpr int kRows = InputType::kRows; 45 static constexpr int kCols = InputType::kCols; 46 using InputBufferType = typename InputType::BufferType; 47 using BufferEvalImplType = 48 OutputStageEvalBufferImpl<OutputStage, InputBufferType>; 49 using OutputBufferType = typename BufferEvalImplType::OutputType; 50 using OutputScalarType = typename OutputBufferType::ScalarType; 51 using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>; 52 OutputStageEvalImplOutputStageEvalImpl53 OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {} 54 EvalOutputStageEvalImpl55 OutputType Eval(InputType input, int, int) const { 56 OutputType output; 57 output.buf = buffer_eval_impl.Eval(input.buf); 58 return output; 59 } 60 61 const BufferEvalImplType buffer_eval_impl; 62 }; 63 64 template <int Size> 65 struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale, 66 RegisterBuffer<std::int32_t, Size>> { 67 using InputType = RegisterBuffer<std::int32_t, Size>; 68 using OutputType = RegisterBuffer<std::int32_t, Size>; 69 70 typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage; 71 72 OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} 73 74 OutputType Eval(InputType input) const { 75 const int result_shift = output_stage.result_shift; 76 const std::int32_t result_mult_int = output_stage.result_mult_int; 77 using RegisterType = typename InputType::RegisterType; 78 const RegisterType result_offset = 79 Dup<RegisterType>(output_stage.result_offset); 80 OutputType output; 81 for (int i = 0; i < InputType::kRegisterCount; i++) { 82 output.reg[i] = RoundingDivideByPOT( 83 Mul(Add(input.reg[i], result_offset), result_mult_int), result_shift); 84 } 85 return output; 86 } 87 88 const OutputStage& output_stage; 89 }; 90 91 template <int Rows, int Cols, VectorShape Shape> 92 struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>, 93 RegisterBlock<std::int32_t, Rows, Cols>> { 94 typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; 95 typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; 96 typedef OutputStageQuantizeDownInt32ToUint8ScalePC<Shape> OutputStage; 97 98 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 99 100 OutputType Eval(InputType input, int row, int col) const { 101 OutputType output; 102 const int result_shift = output_stage.result_shift; 103 const int pos = Shape == VectorShape::Col ? row : col; 104 const auto result_mult_int = 105 LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos); 106 const auto result_offset = 107 LoadForBroadcasting<InputType>(output_stage.result_offset, pos); 108 const auto dividend = BroadcastMul<InputType>( 109 BroadcastAdd<InputType>(input, result_offset), result_mult_int); 110 for (int i = 0; i < InputType::kRegisterCount; i++) { 111 output.buf.reg[i] = 112 RoundingDivideByPOT(dividend.buf.reg[i], result_shift); 113 } 114 return output; 115 } 116 117 const OutputStage& output_stage; 118 }; 119 120 template <int Size> 121 struct OutputStageEvalBufferImpl< 122 OutputStageQuantizeDownInt32ByFixedPoint, 123 RegisterBuffer<std::int32_t, Size>> { 124 typedef RegisterBuffer<std::int32_t, Size> InputType; 125 typedef RegisterBuffer<std::int32_t, Size> OutputType; 126 127 typedef OutputStageQuantizeDownInt32ByFixedPoint OutputStage; 128 129 OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} 130 131 OutputType Eval(InputType input) const { 132 OutputType output; 133 using RegisterType = typename InputType::RegisterType; 134 const RegisterType result_offset_after_shift = 135 Dup<RegisterType>(output_stage.result_offset_after_shift); 136 for (int i = 0; i < InputType::kRegisterCount; i++) { 137 const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul( 138 input.reg[i], output_stage.result_fixedpoint_multiplier); 139 output.reg[i] = 140 Add(RoundingDivideByPOT(mulhigh_val, output_stage.result_shift), 141 result_offset_after_shift); 142 } 143 return output; 144 } 145 146 const OutputStage& output_stage; 147 }; 148 149 template <int Size> 150 struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent, 151 RegisterBuffer<std::int32_t, Size>> { 152 typedef RegisterBuffer<std::int32_t, Size> InputType; 153 typedef RegisterBuffer<std::int32_t, Size> OutputType; 154 155 typedef OutputStageScaleInt32ByFixedPointAndExponent OutputStage; 156 157 OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) { 158 left_shift = std::max(0, output_stage.result_exponent); 159 right_shift = std::max(0, -output_stage.result_exponent); 160 } 161 162 OutputType Eval(InputType input) const { 163 OutputType output; 164 using RegisterType = typename InputType::RegisterType; 165 const RegisterType result_offset_after_shift = 166 Dup<RegisterType>(output_stage.result_offset_after_shift); 167 for (int i = 0; i < InputType::kRegisterCount; i++) { 168 const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul( 169 ShiftLeft(input.reg[i], left_shift), 170 output_stage.result_fixedpoint_multiplier); 171 output.reg[i] = Add(RoundingDivideByPOT(mulhigh_val, right_shift), 172 result_offset_after_shift); 173 } 174 return output; 175 } 176 177 const OutputStage& output_stage; 178 int left_shift; 179 int right_shift; 180 }; 181 182 // Implementation of OutputStageSaturatingCastToUint8 for scalar data 183 template <int Size> 184 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, 185 RegisterBuffer<std::int32_t, Size>> { 186 typedef RegisterBuffer<std::int32_t, Size> InputType; 187 typedef RegisterBuffer<std::uint8_t, Size> OutputType; 188 static_assert(InputType::kRegisterLanes == 1, 189 "This path is only for scalar values"); 190 191 typedef OutputStageSaturatingCastToUint8 OutputStage; 192 193 OutputStageEvalBufferImpl(const OutputStage&) {} 194 195 OutputType Eval(InputType input) const { 196 OutputType output; 197 for (int i = 0; i < InputType::kRegisterCount; i++) { 198 std::int32_t data = input.reg[i]; 199 output.reg[i] = data > 255 ? 255 : data < 0 ? 0 : data; 200 } 201 return output; 202 } 203 }; 204 205 // Implementation of OutputStageSaturatingCastToInt16 for scalar data 206 template <int Size> 207 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, 208 RegisterBuffer<std::int32_t, Size>> { 209 typedef RegisterBuffer<std::int32_t, Size> InputType; 210 typedef RegisterBuffer<std::int16_t, Size> OutputType; 211 static_assert(InputType::kRegisterLanes == 1, 212 "This path is only for scalar values"); 213 214 typedef OutputStageSaturatingCastToInt16 OutputStage; 215 216 OutputStageEvalBufferImpl(const OutputStage&) {} 217 218 OutputType Eval(InputType input) const { 219 OutputType output; 220 for (int i = 0; i < InputType::kRegisterCount; i++) { 221 std::int32_t data = input.reg[i]; 222 output.reg[i] = data > 32767 ? 32767 : data < -32768 ? -32768 : data; 223 } 224 return output; 225 } 226 }; 227 228 template <int Rows, int Cols, typename VectorType> 229 struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, 230 RegisterBlock<std::int32_t, Rows, Cols>> { 231 typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; 232 typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; 233 typedef OutputStageBiasAddition<VectorType> OutputStage; 234 235 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 236 237 OutputType Eval(InputType input, int row, int col) const { 238 const int pos = VectorType::kShape == VectorShape::Row ? col : row; 239 return BroadcastAdd<InputType>( 240 input, LoadForBroadcasting<InputType>(output_stage.bias_vector, pos)); 241 } 242 243 const OutputStage& output_stage; 244 }; 245 246 template <int Size> 247 struct OutputStageEvalBufferImpl<OutputStageClamp, 248 RegisterBuffer<std::int32_t, Size>> { 249 typedef RegisterBuffer<std::int32_t, Size> InputType; 250 typedef RegisterBuffer<std::int32_t, Size> OutputType; 251 252 typedef OutputStageClamp OutputStage; 253 254 OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} 255 256 OutputType Eval(InputType input) const { 257 using RegisterType = typename InputType::RegisterType; 258 const RegisterType min = Dup<RegisterType>(output_stage.min); 259 const RegisterType max = Dup<RegisterType>(output_stage.max); 260 OutputType output; 261 for (int i = 0; i < InputType::kRegisterCount; i++) { 262 output.reg[i] = Min(Max(input.reg[i], min), max); 263 } 264 return output; 265 } 266 267 const OutputStage& output_stage; 268 }; 269 270 template <int Size> 271 struct OutputStageEvalBufferImpl<OutputStageTanh, 272 RegisterBuffer<std::int32_t, Size>> { 273 typedef RegisterBuffer<std::int32_t, Size> InputType; 274 typedef RegisterBuffer<std::int32_t, Size> OutputType; 275 using RegisterType = typename InputType::RegisterType; 276 typedef RegisterType DataType; 277 typedef OutputStageTanh OutputStage; 278 279 OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) { 280 const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; 281 const std::int32_t real_amplitude_as_int32 = 282 output_stage.real_amplitude_as_int32; 283 284 input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32; 285 input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32; 286 output_min = real_zero_as_int32 - real_amplitude_as_int32; 287 output_max = real_zero_as_int32 + real_amplitude_as_int32; 288 289 double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32; 290 inverse_amplitude_neg_exponent = 0; 291 while (inverse_amplitude_normalized_double < 0.5) { 292 inverse_amplitude_normalized_double *= 2; 293 inverse_amplitude_neg_exponent++; 294 } 295 inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble( 296 inverse_amplitude_normalized_double); 297 298 double amplitude_normalized_double = real_amplitude_as_int32; 299 amplitude_exponent = 0; 300 while (amplitude_normalized_double >= 1.0) { 301 amplitude_normalized_double *= 0.5; 302 amplitude_exponent++; 303 } 304 amplitude_normalized = 305 FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double); 306 } 307 308 OutputType Eval(InputType input) const { 309 const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; 310 311 typedef FixedPoint<DataType, 3> F3; 312 typedef FixedPoint<DataType, 0> F0; 313 314 OutputType output; 315 316 for (int i = 0; i < OutputType::kRegisterCount; i++) { 317 // fixed-point affine transformation 318 DataType input_centered = 319 Sub(input.reg[i], Dup<DataType>(real_zero_as_int32)); 320 F3 fixedpoint_input = 321 F3::FromRaw(input_centered) * inverse_amplitude_normalized; 322 // left shift 323 fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(), 324 28 - inverse_amplitude_neg_exponent); 325 // fixed-point tanh and multiplication 326 F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized; 327 // right shift 328 DataType int32_output = 329 Add(Dup<DataType>(real_zero_as_int32), 330 ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent)); 331 332 DataType mask_if_below_cutoff_min = 333 MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min)); 334 DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual( 335 input.reg[i], Dup<DataType>(input_cutoff_max)); 336 337 output.reg[i] = SelectUsingMask( 338 mask_if_below_cutoff_min, Dup<DataType>(output_min), 339 SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max), 340 int32_output)); 341 } 342 return output; 343 } 344 345 const OutputStage& output_stage; 346 std::int32_t input_cutoff_min, input_cutoff_max; 347 std::int32_t output_min, output_max; 348 FixedPoint<DataType, 0> inverse_amplitude_normalized; 349 int inverse_amplitude_neg_exponent; 350 FixedPoint<DataType, 0> amplitude_normalized; 351 int amplitude_exponent; 352 }; 353 354 // OutputPipelineOutputType is a helper to determine the output data type of a 355 // pipeline, for a 356 // given input data type. It is a recursive template; see the explanation on 357 // OutputPipelineEvalImpl below. 358 template <typename OutputPipelineType, int FirstStage, typename InputType, 359 bool StopRecursion = 360 FirstStage == std::tuple_size<OutputPipelineType>::value> 361 struct OutputPipelineOutputType { 362 typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type 363 FirstStageType; 364 typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType 365 FirstStageOutputType; 366 typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1, 367 FirstStageOutputType>::Type Type; 368 }; 369 370 template <typename OutputPipelineType, int FirstStage, typename InputType> 371 struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, 372 true> { 373 typedef InputType Type; 374 }; 375 376 // OutputPipelineEvalImpl is a helper to implement the evaluation of 377 // the whole pipeline. It is a recursive template to implement compile-time 378 // unrolling of the loop over all pipeline stages. The 'FirstStage' parameter 379 // is how we implement recursion: each specialization implements only 380 // evaluation starting at 'FirstStage'. The StopRecursion parameter is just a 381 // helper to implement the termination of the recursion as a partial 382 // specialization below. 383 template <typename OutputPipelineType, int FirstStage, typename InputType, 384 bool StopRecursion = 385 FirstStage == std::tuple_size<OutputPipelineType>::value> 386 struct OutputPipelineEvalImpl { 387 typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type 388 FirstStageType; 389 typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType 390 FirstStageOutputType; 391 typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage, 392 InputType>::Type OutputType; 393 394 OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline) 395 : head_impl(std::get<FirstStage>(output_pipeline)), 396 tail_impl(output_pipeline) {} 397 398 OutputType Eval(InputType input, int row, int col) const { 399 // Evaluate the first stage. 400 FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col); 401 // Recurse into the remaining stages. 402 return tail_impl.Eval(first_stage_output, row, col); 403 } 404 405 const OutputStageEvalImpl<FirstStageType, InputType> head_impl; 406 const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1, 407 FirstStageOutputType> 408 tail_impl; 409 }; 410 411 // Specialization on 'StopRecursion' for terminating the recursion. 412 template <typename OutputPipelineType, int FirstStage, typename InputType> 413 struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> { 414 OutputPipelineEvalImpl(const OutputPipelineType&) {} 415 416 InputType Eval(InputType input, int, int) const { 417 // Terminating the recursion. 418 return input; 419 } 420 }; 421 422 template <typename RegisterBlockType, typename DstType> 423 struct StoreFinalOutputImpl { 424 static_assert(std::is_same<RegisterBlockType, void>::value, 425 "This generic impl should never be hit"); 426 }; 427 428 template <typename ScalarType, int Rows, int Cols, typename DstType> 429 struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> { 430 using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>; 431 static void Run(const RegisterBlockType& src, DstType* dst, int row, 432 int col) { 433 for (int r = 0; r < Rows; r++) { 434 for (int c = 0; c < Cols; c++) { 435 *dst->data(row + r, col + c) = src.buf.reg[r + c * Rows]; 436 } 437 } 438 } 439 }; 440 441 // StoreFinalOutput takes the final value at the end of the output pipeline and 442 // stores it into the destination matrix. It can be specialized for different 443 // data types; the generic implementation here is typically used only for plain 444 // old scalar (not SIMD) types. 445 template <typename RegisterBlockType, typename DstType> 446 void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) { 447 StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col); 448 } 449 450 template <typename OutputPipelineType, typename InputType> 451 struct OutputPipelineExecutor { 452 OutputPipelineExecutor(const OutputPipelineType& output_pipeline) 453 : output_pipeline_eval_impl_(output_pipeline) {} 454 455 // RunOutputPipeline is the entry point into the output pipeline evaluation 456 // code. It should be the only thing that unpack code calls. It takes the 457 // result 458 // of the unpack stage and stores it into the destination matrix. 459 template <typename DstType> 460 void Execute(InputType input, DstType* dst, int src_global_row, 461 int src_global_col, int dst_row, int dst_col) const { 462 // Statically assert that the output pipeline matches the given destination 463 // matrix's scalar type. 464 typedef typename OutputPipelineOutputType< 465 OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType 466 467 ScalarOutputType; 468 typedef typename DstType::Scalar ScalarDstType; 469 static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value, 470 "mismatched destination scalar type and output pipeline"); 471 472 // Evaluate the output pipeline. 473 auto output = 474 output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col); 475 // Store the result into the destination matrix. 476 StoreFinalOutput(output, dst, dst_row, dst_col); 477 } 478 479 const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType> 480 output_pipeline_eval_impl_; 481 }; 482 483 } // namespace gemmlowp 484 485 #ifdef GEMMLOWP_NEON 486 #include "output_neon.h" 487 #elif defined(GEMMLOWP_SSE4) 488 #include "output_sse.h" 489 #elif defined(GEMMLOWP_MSA) 490 #include "output_msa.h" 491 #endif 492 493 #endif // GEMMLOWP_INTERNAL_OUTPUT_H_ 494