index/main/main-index-merger.cc

// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/main/main-index-merger.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/posting_list/posting-list-common.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

class HitSelector {
 public:
  // Returns whether or not term_id_hit_pair has the same term_id, document_id
  // and section_id as the previously selected hits.
  bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) {
    return prev_.term_id() == term_id_hit_pair.term_id() &&
           prev_.hit().document_id() == term_id_hit_pair.hit().document_id() &&
           prev_.hit().section_id() == term_id_hit_pair.hit().section_id();
  }

  // Merges term_id_hit_pair with previously added hits.
  void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (term_id_hit_pair.hit().is_prefix_hit()) {
      SelectPrefixHitIfBetter(term_id_hit_pair);
    } else {
      SelectExactHitIfBetter(term_id_hit_pair);
    }
    prev_ = term_id_hit_pair;
  }

  // Adds all valid, selected hits to hits starting at position pos in hits.
  // Returns the offset in hits after the position of the last added hit.
  // This function may add between 0-2 hits depending on whether the HitSelector
  // holds both a valid exact hit and a valid prefix hit, one of those or none.
  size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) {
    // Given the prefix/exact hits for a given term+docid+sectionid, push needed
    // hits into hits array at offset pos. Return new pos.
    if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
      const Hit& prefix_hit = best_prefix_hit_.hit();
      // The prefix hit has score equal to the sum of the scores, capped at
      // kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          prefix_hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          best_prefix_hit_.term_id(),
          Hit(prefix_hit.section_id(), prefix_hit.document_id(),
              final_term_frequency, prefix_hit.is_in_prefix_section(),
              prefix_hit.is_prefix_hit()));
      (*hits)[pos++] = best_prefix_hit_;
      // Ensure sorted.
      if (best_prefix_hit_.hit() < best_exact_hit_.hit()) {
        std::swap((*hits)[pos - 1], (*hits)[pos - 2]);
      }
    } else if (best_prefix_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_prefix_hit_;
    } else if (best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
    }

    return pos;
  }

  void Reset() {
    best_prefix_hit_ = TermIdHitPair();
    best_exact_hit_ = TermIdHitPair();
    prev_ = TermIdHitPair();
  }

 private:
  void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_prefix_hit_.hit().is_valid()) {
      best_prefix_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new prefix hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_prefix_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_prefix_hit_.hit().is_in_prefix_section(),
              best_prefix_hit_.hit().is_prefix_hit()));
    }
  }

  void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_exact_hit_.hit().is_valid()) {
      best_exact_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new exact hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxHitScore.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_exact_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_exact_hit_.hit().is_in_prefix_section(),
              best_exact_hit_.hit().is_prefix_hit()));
    }
  }

  TermIdHitPair best_prefix_hit_;
  TermIdHitPair best_exact_hit_;
  TermIdHitPair prev_;
};

class HitComparator {
 public:
  explicit HitComparator(
      const TermIdCodec& term_id_codec,
      const std::unordered_map<uint32_t, int>& main_tvi_to_block_index)
      : term_id_codec_(&term_id_codec),
        main_tvi_to_block_index_(&main_tvi_to_block_index) {}

  bool operator()(const TermIdHitPair& lhs, const TermIdHitPair& rhs) const {
    // Primary sort by index block. This acheives two things:
    // 1. It reduces the number of flash writes by grouping together new hits
    // for terms whose posting lists might share the same index block.
    // 2. More importantly, this ensures that newly added backfill branch points
    // will be populated first (because all newly added terms have an invalid
    // block index of 0) before any new hits are added to the postings lists
    // that they backfill from.
    int lhs_index_block = GetIndexBlock(lhs.term_id());
    int rhs_index_block = GetIndexBlock(rhs.term_id());
    if (lhs_index_block == rhs_index_block) {
      // Secondary sort by term_id and hit.
      return lhs.value() < rhs.value();
    }
    return lhs_index_block < rhs_index_block;
  }

 private:
  int GetIndexBlock(uint32_t term_id) const {
    auto term_info_or = term_id_codec_->DecodeTermInfo(term_id);
    if (!term_info_or.ok()) {
      ICING_LOG(WARNING)
          << "Unable to decode term-info during merge. This shouldn't happen.";
      return kInvalidBlockIndex;
    }
    TermIdCodec::DecodedTermInfo term_info =
        std::move(term_info_or).ValueOrDie();
    auto itr = main_tvi_to_block_index_->find(term_info.tvi);
    if (itr == main_tvi_to_block_index_->end()) {
      return kInvalidBlockIndex;
    }
    return itr->second;
  }

  const TermIdCodec* term_id_codec_;
  const std::unordered_map<uint32_t, int>* main_tvi_to_block_index_;
};

// A helper function to dedupe hits stored in hits. Suppose that the lite index
// contained a single document with two hits in a single prefix section: "foot"
// and "fool". When expanded, there would be four hits:
// {"fo", docid0, sectionid0}
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// The first two are duplicates of each other. So, this function will dedupe
// and shrink hits to be:
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// When two or more prefix hits are duplicates, merge into one hit with term
// frequency as the sum of the term frequencies. If there is both an exact and
// prefix hit for the same term, keep the exact hit as it is, update the prefix
// hit so that its term frequency is the sum of the term frequencies.
void DedupeHits(
    std::vector<TermIdHitPair>* hits, const TermIdCodec& term_id_codec,
    const std::unordered_map<uint32_t, int>& main_tvi_to_block_index) {
  // Now all terms are grouped together and all hits for a term are sorted.
  // Merge equivalent hits into one.
  std::sort(hits->begin(), hits->end(),
            HitComparator(term_id_codec, main_tvi_to_block_index));
  size_t current_offset = 0;
  HitSelector hit_selector;
  for (const TermIdHitPair& term_id_hit_pair : *hits) {
    if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) {
      // We've reached a new hit. Insert the previously selected hits that we
      // had accumulated and reset to add this new hit.
      current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
      hit_selector.Reset();
    }
    // Update best exact and prefix hit.
    hit_selector.SelectIfBetter(term_id_hit_pair);
  }

  // Push last.
  current_offset = hit_selector.InsertSelectedHits(current_offset, hits);

  hits->resize(current_offset);
}

// Based on experiments with full prefix expansion, the multiplier
// is ~4x.
constexpr int kAvgPrefixesPerTerm = 4;

}  // namespace

libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
MainIndexMerger::TranslateAndExpandLiteHits(
    const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
    const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) {
  std::vector<TermIdHitPair> hits;
  if (lite_index.empty()) {
    return hits;
  }
  // Reserve enough space for the average number of prefixes per term and the
  // terms themselves.
  hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1));

  // Translate lite tvis to main tvis.
  for (const TermIdHitPair& term_id_hit_pair : lite_index) {
    uint32_t cur_term_id = term_id_hit_pair.term_id();
    ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
                           term_id_codec.DecodeTermInfo(cur_term_id));
    Hit hit(term_id_hit_pair.hit());

    // 1. Translate and push original.
    auto itr =
        lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi);
    if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) {
      // b/37273773
      return absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Trying to translate lite tvi %u that was never added to the lexicon",
          cur_decoded_term.tvi));
    }
    ICING_ASSIGN_OR_RETURN(uint32_t term_id,
                           term_id_codec.EncodeTvi(itr->second, TviType::MAIN));
    hits.emplace_back(term_id, hit);

    // 2. Expand hits in prefix sections.
    if (hit.is_in_prefix_section()) {
      // Hit was in a prefix section. Push prefixes. Turn on prefix bit.
      auto itr_prefixes =
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find(
              cur_decoded_term.tvi);
      if (itr_prefixes ==
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) {
        ICING_VLOG(1) << "No necessary prefix expansion for "
                      << cur_decoded_term.tvi;
        continue;
      }
      // The tvis of all prefixes of this hit's term that appear in the main
      // lexicon are between [prefix_tvis_buf[offset],
      // prefix_tvis_buf[offset+len]).
      size_t offset = itr_prefixes->second.first;
      size_t len = itr_prefixes->second.second;
      size_t offset_end_exclusive = offset + len;
      Hit prefix_hit(hit.section_id(), hit.document_id(), hit.term_frequency(),
                     /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
      for (; offset < offset_end_exclusive; ++offset) {
        // Take the tvi (in the main lexicon) of each prefix term.
        uint32_t prefix_main_tvi =
            lexicon_merge_outputs.prefix_tvis_buf[offset];
        // Convert it to a term_id.
        ICING_ASSIGN_OR_RETURN(
            uint32_t prefix_term_id,
            term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN));
        // Create add an element for this prefix TermId and prefix Hit to hits.
        hits.emplace_back(prefix_term_id, prefix_hit);
      }
    }
  }
  // 3. Remove any duplicate hits.
  DedupeHits(&hits, term_id_codec,
             lexicon_merge_outputs.main_tvi_to_block_index);
  return hits;
}

}  // namespace lib
}  // namespace icing