// Copyright 2015 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // unpack.h: unpacking the result blocks computed by compute.h, // storing them into the destination matrix. #ifndef GEMMLOWP_INTERNAL_UNPACK_H_ #define GEMMLOWP_INTERNAL_UNPACK_H_ #include "allocator.h" #include "block_params.h" #include "output.h" #include "pack.h" #include namespace gemmlowp { class PackedResult { public: PackedResult(Allocator* _allocator, const BlockParams& _block_params) : allocator_(_allocator), block_params_(_block_params) { matrix_handle_ = allocator_->Reserve(block_params_.l2_rows * block_params_.l2_cols); } ~PackedResult() {} MatrixMap Map() { return MatrixMap( allocator_->GetPointer(matrix_handle_), block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); } MatrixMap Map() const { return MatrixMap( allocator_->GetPointer(matrix_handle_), block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); } private: Allocator* allocator_; Allocator::Handle matrix_handle_; const BlockParams& block_params_; }; template std::int32_t RoundingMultiplyByConstantFraction(std::int32_t x) { if (numerator == denominator) { return x; } // We'll use only signed arithmetic here. This is // simpler (since this function operates on signed int32's) and // more friendly to ARM NEON, where this allows us to use the // VQRDMULH instruction. static const std::int32_t int_quotient = (numerator + denominator / 2) / denominator; static const std::int32_t remaining_numerator = numerator - int_quotient * denominator; static const std::int32_t scaled_remaining_numerator = static_cast( (static_cast(remaining_numerator) * (1ll << 31)) / denominator); const std::int64_t scaled_remaining_product = static_cast(x) * static_cast(scaled_remaining_numerator); const std::int32_t scaled_remaining_product_nudge = (scaled_remaining_product > 0 ? 1 : -1) * (1 << 30); const std::int32_t remaining_product = static_cast( (scaled_remaining_product + scaled_remaining_product_nudge) / (1u << 31)); return x * int_quotient + remaining_product; } template struct UnpackResultImplGeneric { static void Unpack(ResultBlockType* dst, const PackedResultType& src, int depth, const std::int32_t* lhs_sums_of_each_slice, const std::int32_t* rhs_sums_of_each_slice, const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, const OutputPipelineType& output_pipeline) { auto src_map = src.Map(); // No top-level blocking in the depth dimension at the moment. // Too much loss of precision. const int kLhsBits = BitDepthParams::LhsBitDepth::kBits; const int kRhsBits = BitDepthParams::RhsBitDepth::kBits; const std::int32_t kLhsMax = (1 << kLhsBits) - 1; const std::int32_t kRhsMax = (1 << kRhsBits) - 1; OutputPipelineExecutor output_pipeline_executor(output_pipeline); for (int c = 0; c < dst->cols(); c++) { for (int r = 0; r < dst->rows(); r++) { // To understand this code, read // doc/low-precision.txt // doc/less-than-8-bit.txt // We have 4 terms to sum: xx, x1, 1x, 11. // In case of requantization, we first need to scale them back // to the original scale, using RoundingMultiplyByConstantFraction. std::int32_t raw_xx = src_map(r, c); std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset(c); std::int32_t raw_1x = rhs_sums_of_each_slice[c] * lhs_offset(r); std::int32_t term_xx = RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>( raw_xx); std::int32_t term_x1 = RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1); std::int32_t term_1x = RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x); std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth; // Sum the 4 terms. FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11; output_pipeline_executor.Execute(sum, dst, r, c); } } } }; template struct UnpackResultImpl : UnpackResultImplGeneric {}; template void UnpackResult(ResultBlockType* dst, const PackedResultType& src, int depth, const std::int32_t* lhs_sums_of_each_slice, const std::int32_t* rhs_sums_of_each_slice, const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, const OutputPipelineType& output_pipeline) { ScopedProfilingLabel label("unpack"); UnpackResultImpl::Unpack( dst, src, depth, lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, rhs_offset, output_pipeline); } } // namespace gemmlowp #ifdef GEMMLOWP_NEON #include "unpack_neon.h" #endif #endif // GEMMLOWP_INTERNAL_UNPACK_H_