/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef INCLUDE_PERFETTO_EXT_BASE_FLAT_HASH_MAP_H_
#define INCLUDE_PERFETTO_EXT_BASE_FLAT_HASH_MAP_H_

#include "perfetto/base/compiler.h"
#include "perfetto/base/logging.h"
#include "perfetto/ext/base/hash.h"
#include "perfetto/ext/base/utils.h"

#include <algorithm>
#include <functional>
#include <limits>

namespace perfetto {
namespace base {

// An open-addressing hashmap implementation.
// Pointers are not stable, neither for keys nor for values.
// Has performance similar to a Robin Hood hash (without the complications) and
// roughly 2x that of std::unordered_map.
// Doc: http://go/perfetto-hashtables .
//
// When used to implement a string pool in TraceProcessor, the performance
// characteristics obtained by replaying the set of strings seen in a 4GB trace
// (226M strings, 1M unique) are the following (see flat_hash_map_benchmark.cc):
// This(Linear+AppendOnly)    879,383,676 ns    258.013M insertions/s
// This(LinearProbe):         909,206,047 ns    249.546M insertions/s
// This(QuadraticProbe):    1,083,844,388 ns    209.363M insertions/s
// std::unordered_map:      6,203,351,870 ns    36.5811M insertions/s
// tsl::robin_map:            931,403,397 ns    243.622M insertions/s
// absl::flat_hash_map:       998,013,459 ns    227.379M insertions/s
// FollyF14FastMap:         1,181,480,602 ns    192.074M insertions/s

// The structs below define the probing algorithm used to resolve slot
// collisions. They are guaranteed to visit all slots, as our table size is
// always a power of two (see https://en.wikipedia.org/wiki/Quadratic_probing).

// Linear probing can be faster if the hashing is well distributed and the load
// is not high. For TraceProcessor's StringPool this is the fastest. It can
// degenerate badly if the hashing doesn't spread the keys (e.g., when using
// pids directly as keys with a no-op hashing function).
struct LinearProbe {
  static inline size_t Calc(size_t key_hash, size_t step, size_t capacity) {
    return (key_hash + step) & (capacity - 1);  // Linear probe
  }
};

// Generates the sequence: 0, 3, 10, 21, 36, 55, ...
// Can be a bit (~5%) slower than LinearProbe because it's less cache hot, but
// avoids degenerating badly if the hash function is bad and causes clusters.
// A good default choice unless benchmarks prove otherwise.
struct QuadraticProbe {
  static inline size_t Calc(size_t key_hash, size_t step, size_t capacity) {
    return (key_hash + 2 * step * step + step) & (capacity - 1);
  }
};

// Tends to perform in the middle between linear and quadratic.
// It's a bit more cache-effective than QuadraticProbe but can create more
// clustering if the hash function doesn't spread well.
// Generates the sequence: 0, 1, 3, 6, 10, 15, 21, ...
struct QuadraticHalfProbe {
  static inline size_t Calc(size_t key_hash, size_t step, size_t capacity) {
    return (key_hash + (step * step + step) / 2) & (capacity - 1);
  }
};
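
// Illustrative example (not part of the implementation): with capacity = 8 and
// key_hash = 5, the three probes visit the slots in the following order, each
// covering all 8 slots within 8 steps:
//   LinearProbe:        5, 6, 7, 0, 1, 2, 3, 4
//   QuadraticProbe:     5, 0, 7, 2, 1, 4, 3, 6   // (5 + {0, 3, 10, 21, ...}) & 7
//   QuadraticHalfProbe: 5, 6, 0, 3, 7, 4, 2, 1   // (5 + {0, 1, 3, 6, ...}) & 7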

template <typename Key,
          typename Value,
          typename Hasher = base::Hash<Key>,
          typename Probe = QuadraticProbe,
          bool AppendOnly = false>
class FlatHashMap {
 public:
  class Iterator {
   public:
    explicit Iterator(const FlatHashMap* map) : map_(map) { FindNextNonFree(); }
    ~Iterator() = default;
    Iterator(const Iterator&) = default;
    Iterator& operator=(const Iterator&) = default;
    Iterator(Iterator&&) noexcept = default;
    Iterator& operator=(Iterator&&) noexcept = default;

    Key& key() { return map_->keys_[idx_]; }
    Value& value() { return map_->values_[idx_]; }
    const Key& key() const { return map_->keys_[idx_]; }
    const Value& value() const { return map_->values_[idx_]; }

    explicit operator bool() const { return idx_ != kEnd; }
    Iterator& operator++() {
      PERFETTO_DCHECK(idx_ < map_->capacity_);
      ++idx_;
      FindNextNonFree();
      return *this;
    }

   private:
    static constexpr size_t kEnd = std::numeric_limits<size_t>::max();

    void FindNextNonFree() {
      const auto& tags = map_->tags_;
      for (; idx_ < map_->capacity_; idx_++) {
        if (tags[idx_] != kFreeSlot && (AppendOnly || tags[idx_] != kTombstone))
          return;
      }
      idx_ = kEnd;
    }

    const FlatHashMap* map_ = nullptr;
    size_t idx_ = 0;
  };  // Iterator

  static constexpr int kDefaultLoadLimitPct = 75;
  explicit FlatHashMap(size_t initial_capacity = 0,
                       int load_limit_pct = kDefaultLoadLimitPct)
      : load_limit_percent_(load_limit_pct) {
    if (initial_capacity > 0)
      Reset(initial_capacity);
  }

  // We are calling Clear() so that the destructors for the inserted entries
  // are called (unless they are trivial, in which case it will be a no-op).
  ~FlatHashMap() { Clear(); }

  FlatHashMap(FlatHashMap&& other) noexcept {
    tags_ = std::move(other.tags_);
    keys_ = std::move(other.keys_);
    values_ = std::move(other.values_);
    capacity_ = other.capacity_;
    size_ = other.size_;
    max_probe_length_ = other.max_probe_length_;
    load_limit_ = other.load_limit_;
    load_limit_percent_ = other.load_limit_percent_;

    new (&other) FlatHashMap();
  }

  FlatHashMap& operator=(FlatHashMap&& other) noexcept {
    this->~FlatHashMap();
    new (this) FlatHashMap(std::move(other));
    return *this;
  }

  FlatHashMap(const FlatHashMap&) = delete;
  FlatHashMap& operator=(const FlatHashMap&) = delete;
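
  // Typical usage (illustrative sketch; UseKeyAndValue is a hypothetical
  // caller-side function):
  //
  //   FlatHashMap<uint64_t, std::string> map;
  //   auto res = map.Insert(42, "foo");  // res = {pointer to value, true}.
  //   res = map.Insert(42, "bar");       // Key exists: res.second == false and
  //                                      // the stored value stays "foo".
  //   std::string* val = map.Find(42);   // nullptr if the key is absent.
  //   map.Erase(42);                     // Returns false if the key is absent.
  //   for (auto it = map.GetIterator(); it; ++it)
  //     UseKeyAndValue(it.key(), it.value());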

  std::pair<Value*, bool> Insert(Key key, Value value) {
    const size_t key_hash = Hasher{}(key);
    const uint8_t tag = HashToTag(key_hash);
    static constexpr size_t kSlotNotFound = std::numeric_limits<size_t>::max();

    // This for loop does in reality at most two attempts:
    // The first iteration either:
    // - Early-returns, because the key exists already, or
    // - Finds an insertion slot and proceeds, because the load is < limit.
    // The second iteration is only hit in the unlikely case of this insertion
    // bringing the table beyond the target |load_limit_| (or the edge case of
    // the HT being full, if |load_limit_percent_| = 100).
    // We cannot simply pre-grow the table before insertion, because we must
    // guarantee that calling Insert() with a key that already exists doesn't
    // invalidate iterators.
    size_t insertion_slot;
    size_t probe_len;
    for (;;) {
      PERFETTO_DCHECK((capacity_ & (capacity_ - 1)) == 0);  // Must be a pow2.
      insertion_slot = kSlotNotFound;
      // Start the iteration at the desired slot (key_hash % capacity_),
      // searching either for a free slot or a tombstone. In the worst case we
      // might end up scanning the whole array of slots. The Probe functions
      // are guaranteed to visit all the slots within |capacity_| steps. If we
      // find a free slot we can stop the search immediately (a free slot acts
      // as an "end of chain" for entries having the same hash). If we find a
      // tombstone (a deleted slot) we remember its position, but we have to
      // keep searching until a free slot is found, to make sure we don't
      // insert a duplicate key.
      for (probe_len = 0; probe_len < capacity_;) {
        const size_t idx = Probe::Calc(key_hash, probe_len, capacity_);
        PERFETTO_DCHECK(idx < capacity_);
        const uint8_t tag_idx = tags_[idx];
        ++probe_len;
        if (tag_idx == kFreeSlot) {
          // Rationale for "insertion_slot == kSlotNotFound": if we encountered
          // a tombstone while iterating we should reuse that rather than
          // taking another slot.
          if (AppendOnly || insertion_slot == kSlotNotFound)
            insertion_slot = idx;
          break;
        }
        // We should never encounter tombstones in AppendOnly mode.
        PERFETTO_DCHECK(!(tag_idx == kTombstone && AppendOnly));
        if (!AppendOnly && tag_idx == kTombstone) {
          insertion_slot = idx;
          continue;
        }
        if (tag_idx == tag && keys_[idx] == key) {
          // The key is already in the map.
          return std::make_pair(&values_[idx], false);
        }
      }  // for (idx)

      // If we got to this point the key does not exist (otherwise we would
      // have hit the return above) and we are going to insert a new entry.
      // Before doing so, ensure we stay under the target load limit.
      if (PERFETTO_UNLIKELY(size_ >= load_limit_)) {
        MaybeGrowAndRehash(/*grow=*/true);
        continue;
      }
      PERFETTO_DCHECK(insertion_slot != kSlotNotFound);
      break;
    }  // for (attempt)

    PERFETTO_CHECK(insertion_slot < capacity_);

    // We found a free slot (or a tombstone). Proceed with the insertion.
    Value* value_idx = &values_[insertion_slot];
    new (&keys_[insertion_slot]) Key(std::move(key));
    new (value_idx) Value(std::move(value));
    tags_[insertion_slot] = tag;
    PERFETTO_DCHECK(probe_len > 0 && probe_len <= capacity_);
    max_probe_length_ = std::max(max_probe_length_, probe_len);
    size_++;

    return std::make_pair(value_idx, true);
  }

  Value* Find(const Key& key) const {
    const size_t idx = FindInternal(key);
    if (idx == kNotFound)
      return nullptr;
    return &values_[idx];
  }

  bool Erase(const Key& key) {
    if (AppendOnly)
      PERFETTO_FATAL("Erase() not supported because AppendOnly=true");
    size_t idx = FindInternal(key);
    if (idx == kNotFound)
      return false;
    EraseInternal(idx);
    return true;
  }
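
  // Illustrative sketch of the tombstone lifecycle (hypothetical keys/values):
  //
  //   FlatHashMap<int, int> m;
  //   m.Insert(1, 10);  // The slot's tag becomes HashToTag(hash), size_ == 1.
  //   m.Erase(1);       // The tag becomes kTombstone, key/value are destroyed,
  //                     // size_ == 0; the slot does not terminate probe chains
  //                     // (unlike a free slot).
  //   m.Insert(5, 50);  // May reuse the tombstone slot if key 5's probe
  //                     // sequence walks through it; Find() skips tombstones.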

  void Clear() {
    // Avoid trivial heap operations on zero-capacity std::move()-d objects.
    if (PERFETTO_UNLIKELY(capacity_ == 0))
      return;

    for (size_t i = 0; i < capacity_; ++i) {
      const uint8_t tag = tags_[i];
      if (tag != kFreeSlot && tag != kTombstone)
        EraseInternal(i);
    }
    // Clear all tombstones. This is needed especially for AppendOnly, where
    // Insert() assumes it never encounters tombstones.
    MaybeGrowAndRehash(/*grow=*/false);
  }

  Value& operator[](Key key) {
    auto it_and_inserted = Insert(std::move(key), Value{});
    return *it_and_inserted.first;
  }

  Iterator GetIterator() { return Iterator(this); }
  const Iterator GetIterator() const { return Iterator(this); }

  size_t size() const { return size_; }
  size_t capacity() const { return capacity_; }

  // "protected" here is only for the flat_hash_map_benchmark.cc. Everything
  // below is by all means private.
 protected:
  enum ReservedTags : uint8_t { kFreeSlot = 0, kTombstone = 1 };
  static constexpr size_t kNotFound = std::numeric_limits<size_t>::max();

  size_t FindInternal(const Key& key) const {
    const size_t key_hash = Hasher{}(key);
    const uint8_t tag = HashToTag(key_hash);
    PERFETTO_DCHECK((capacity_ & (capacity_ - 1)) == 0);  // Must be a pow2.
    PERFETTO_DCHECK(max_probe_length_ <= capacity_);
    for (size_t i = 0; i < max_probe_length_; ++i) {
      const size_t idx = Probe::Calc(key_hash, i, capacity_);
      const uint8_t tag_idx = tags_[idx];

      if (tag_idx == kFreeSlot)
        return kNotFound;
      // HashToTag() never returns kTombstone, so the tag check below can never
      // match a tombstone slot; tombstones are simply skipped, as intended.
      if (tag_idx == tag && keys_[idx] == key) {
        PERFETTO_DCHECK(tag_idx > kTombstone);
        return idx;
      }
    }  // for (idx)
    return kNotFound;
  }

  void EraseInternal(size_t idx) {
    PERFETTO_DCHECK(tags_[idx] > kTombstone);
    PERFETTO_DCHECK(size_ > 0);
    tags_[idx] = kTombstone;
    keys_[idx].~Key();
    values_[idx].~Value();
    size_--;
  }

  PERFETTO_NO_INLINE void MaybeGrowAndRehash(bool grow) {
    PERFETTO_DCHECK(size_ <= capacity_);
    const size_t old_capacity = capacity_;

    // Grow quickly up to 1MB, then chill.
    const size_t old_size_bytes = old_capacity * (sizeof(Key) + sizeof(Value));
    const size_t grow_factor = old_size_bytes < (1024u * 1024u) ? 8 : 2;
    const size_t new_capacity =
        grow ? std::max(old_capacity * grow_factor, size_t(1024))
             : old_capacity;

    auto old_tags(std::move(tags_));
    auto old_keys(std::move(keys_));
    auto old_values(std::move(values_));
    size_t old_size = size_;

    // This must be a CHECK (i.e. not just a DCHECK) to prevent UAF attacks on
    // 32-bit archs that try to double the size of the table until wrapping.
    PERFETTO_CHECK(new_capacity >= old_capacity);
    Reset(new_capacity);

    size_t new_size = 0;  // Recompute the size.
    for (size_t i = 0; i < old_capacity; ++i) {
      const uint8_t old_tag = old_tags[i];
      if (old_tag != kFreeSlot && old_tag != kTombstone) {
        Insert(std::move(old_keys[i]), std::move(old_values[i]));
        old_keys[i].~Key();  // Destroy the old objects.
        old_values[i].~Value();
        new_size++;
      }
    }
    PERFETTO_DCHECK(new_size == old_size);
    size_ = new_size;
  }
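
  // Worked example of the growth policy above (illustrative numbers): with
  // sizeof(Key) + sizeof(Value) == 16 and the default 75% load limit, a table
  // with capacity_ == 1024 (16 KB of keys + values) has load_limit_ == 768 and
  // grows 8x to 8192 slots on the first Insert() that finds size_ >= 768; once
  // keys + values reach 1 MB, the growth factor drops to 2x.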

  // Doesn't call destructors. Use Clear() for that.
  PERFETTO_NO_INLINE void Reset(size_t n) {
    PERFETTO_DCHECK((n & (n - 1)) == 0);  // Must be a pow2.

    capacity_ = n;
    max_probe_length_ = 0;
    size_ = 0;
    load_limit_ = n * static_cast<size_t>(load_limit_percent_) / 100;
    load_limit_ = std::min(load_limit_, n);

    tags_.reset(new uint8_t[n]);
    memset(&tags_[0], 0, n);  // Clear all tags.
    keys_ = AlignedAllocTyped<Key[]>(n);      // Deliberately not 0-initialized.
    values_ = AlignedAllocTyped<Value[]>(n);  // Deliberately not 0-initialized.
  }

  static inline uint8_t HashToTag(size_t full_hash) {
    uint8_t tag = full_hash >> (sizeof(full_hash) * 8 - 8);
    // Ensure the tag is always >= 2. We use 0, 1 for kFreeSlot and kTombstone.
    tag += (tag <= kTombstone) << 1;
    PERFETTO_DCHECK(tag > kTombstone);
    return tag;
  }

  size_t capacity_ = 0;
  size_t size_ = 0;
  size_t max_probe_length_ = 0;
  size_t load_limit_ = 0;  // Updated every time |capacity_| changes.
  int load_limit_percent_ =
      kDefaultLoadLimitPct;  // Load factor limit in % of |capacity_|.

  // These arrays always have |capacity_| elements.
  // Note: AlignedUniquePtr just allocates memory, doesn't invoke any ctor/dtor.
  std::unique_ptr<uint8_t[]> tags_;
  AlignedUniquePtr<Key[]> keys_;
  AlignedUniquePtr<Value[]> values_;
};

}  // namespace base
}  // namespace perfetto

#endif  // INCLUDE_PERFETTO_EXT_BASE_FLAT_HASH_MAP_H_