// Copyright 2024 gRPC authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "src/core/util/tdigest.h" #include #include #include #include #include #include #include #include #include "absl/container/flat_hash_map.h" #include "absl/random/random.h" #include "gmock/gmock.h" #include "gtest/gtest.h" using testing::DoubleNear; using testing::NanSensitiveDoubleEq; namespace grpc_core { namespace { constexpr double kNan = std::numeric_limits::quiet_NaN(); double GetTrueQuantile(const std::vector& samples, double quantile) { std::vector s = samples; double true_idx = static_cast(s.size()) * quantile - 1; double idx_left = std::floor(true_idx); double idx_right = std::ceil(true_idx); std::sort(s.begin(), s.end()); if (idx_left == idx_right) { return s[idx_left]; } return s[idx_left] * (idx_right - true_idx) + s[idx_right] * (true_idx - idx_left); } double GetTrueCdf(const std::vector& samples, double val) { std::vector s = samples; std::sort(s.begin(), s.end()); if (val < s.front()) { return 0; } if (val >= s.back()) { return 1; } int true_idx = 0; for (double v : s) { if (v > val) { break; } ++true_idx; } return static_cast(true_idx) / static_cast(samples.size()); } } // namespace TEST(TDigestTest, Reset) { TDigest tdigest(100); EXPECT_EQ(tdigest.Compression(), 100); tdigest.Reset(50); EXPECT_EQ(tdigest.Compression(), 50); tdigest.Reset(20); EXPECT_EQ(tdigest.Compression(), 20); } TEST(TDigestTest, Stats) { TDigest tdigest(100); tdigest.Add(10); EXPECT_EQ(1, tdigest.Count()); EXPECT_EQ(10, tdigest.Min()); EXPECT_EQ(10, tdigest.Max()); EXPECT_EQ(10, tdigest.Sum()); EXPECT_EQ(100, tdigest.Compression()); tdigest.Add(20); EXPECT_EQ(2, tdigest.Count()); EXPECT_EQ(10, tdigest.Min()); EXPECT_EQ(20, tdigest.Max()); EXPECT_EQ(30, tdigest.Sum()); EXPECT_EQ(100, tdigest.Compression()); } TEST(TDigestTest, MergeMultipleIntoSingleValued) { constexpr double kMerges = 100; constexpr double kCompression = 100; TDigest tdigest(kCompression); auto p01 = tdigest.Quantile(.01); auto p50 = tdigest.Quantile(.50); auto p99 = tdigest.Quantile(.99); EXPECT_THAT(p01, NanSensitiveDoubleEq(kNan)); EXPECT_THAT(p50, NanSensitiveDoubleEq(kNan)); EXPECT_THAT(p99, NanSensitiveDoubleEq(kNan)); for (int i = 0; i < kMerges; i++) { TDigest new_tdigest(kCompression); new_tdigest.Add(10); new_tdigest.Merge(tdigest); new_tdigest.Swap(tdigest); } EXPECT_EQ(kMerges, tdigest.Count()); p01 = tdigest.Quantile(.01); p50 = tdigest.Quantile(.50); p99 = tdigest.Quantile(.99); EXPECT_THAT(p01, NanSensitiveDoubleEq(10)); EXPECT_THAT(p50, NanSensitiveDoubleEq(10)); EXPECT_THAT(p99, NanSensitiveDoubleEq(10)); } TEST(TDigestTest, MergeSingleIntoMultipleValued) { constexpr double kMerges = 100; constexpr double kCompression = 100; TDigest tdigest(kCompression); auto p01 = tdigest.Quantile(.01); auto p50 = tdigest.Quantile(.50); auto p99 = tdigest.Quantile(.99); EXPECT_THAT(p01, NanSensitiveDoubleEq(kNan)); EXPECT_THAT(p50, NanSensitiveDoubleEq(kNan)); EXPECT_THAT(p99, NanSensitiveDoubleEq(kNan)); for (int i = 0; i < kMerges; i++) { TDigest new_tdigest(kCompression); new_tdigest.Add(10); tdigest.Merge(new_tdigest); } EXPECT_EQ(kMerges, tdigest.Count()); p01 = tdigest.Quantile(.01); p50 = tdigest.Quantile(.50); p99 = tdigest.Quantile(.99); EXPECT_THAT(p01, NanSensitiveDoubleEq(10)); EXPECT_THAT(p50, NanSensitiveDoubleEq(10)); EXPECT_THAT(p99, NanSensitiveDoubleEq(10)); } TEST(TDigestTest, CdfBetweenLastCentroidAndMax) { // Make sure we return the correct CDF value for an element between the last // centroid and maximum. constexpr double kCompression = 10; TDigest tdigest(kCompression); for (int i = 1; i <= 100; i++) { tdigest.Add(i); } for (int i = 1; i <= 100; i++) { tdigest.Add(i * 100); } for (int i = 1; i <= 100; i++) { tdigest.Add(i * 200); } auto cdf_min = tdigest.Cdf(1); EXPECT_THAT(cdf_min, NanSensitiveDoubleEq(0)); auto cdf_max = tdigest.Cdf(20000); EXPECT_THAT(cdf_max, NanSensitiveDoubleEq(1)); auto cdf_tail = tdigest.Cdf(20000 - 1); EXPECT_THAT(cdf_tail, DoubleNear(0.9999, 1e-4)); } TEST(TDigestTest, CdfMostlyMin) { // Make sure we return the correct CDF value when most samples are the // minimum value. constexpr double kCompression = 10; constexpr double kMin = 0; constexpr double kMax = 1; TDigest tdigest(kCompression); for (int i = 0; i < 100; i++) { tdigest.Add(kMin); } tdigest.Add(kMax); auto cdf_min = tdigest.Cdf(kMin); EXPECT_THAT(cdf_min, DoubleNear(0.98, 1e-3)); auto cdf_max = tdigest.Cdf(kMax); EXPECT_THAT(cdf_max, NanSensitiveDoubleEq(1)); } TEST(TDigestTest, SingletonInACrowd) { // Add a test case similar to what is reported upstream: // https://github.com/tdunning/t-digest/issues/89 // // We want to make sure when we have 10k samples of a specific number, // we do not lose information about a single sample at the tail. constexpr int kCrowdSize = 10000; constexpr double kCompression = 100; TDigest tdigest(kCompression); for (int i = 0; i < kCrowdSize; i++) { tdigest.Add(10); } tdigest.Add(20); EXPECT_THAT(tdigest.Quantile(0), NanSensitiveDoubleEq(10)); EXPECT_THAT(tdigest.Quantile(0.5), NanSensitiveDoubleEq(10)); EXPECT_THAT(tdigest.Quantile(0.8), NanSensitiveDoubleEq(10)); EXPECT_THAT(tdigest.Quantile(0.9), NanSensitiveDoubleEq(10)); EXPECT_THAT(tdigest.Quantile(0.99), NanSensitiveDoubleEq(10)); EXPECT_THAT(tdigest.Quantile(0.999), NanSensitiveDoubleEq(10)); EXPECT_THAT(tdigest.Quantile(1), NanSensitiveDoubleEq(20)); } TEST(TDigestTest, MergeDifferentCompressions) { TDigest tdigest_1(100); TDigest tdigest_2(200); for (int i = 0; i < 10; ++i) { tdigest_1.Add(1); tdigest_2.Add(2); } { // Compression/discrete should remain "to be determined". TDigest tdigest_0(0); tdigest_0.Merge(tdigest_0); EXPECT_EQ(0, tdigest_0.Compression()); } { // Should take compression/discrete of tdigest_1. TDigest tdigest_0(0); tdigest_0.Merge(tdigest_1); EXPECT_EQ(100, tdigest_0.Compression()); EXPECT_EQ(1, tdigest_0.Max()); EXPECT_EQ(10, tdigest_0.Count()); } { // Should take compression/discrete of tdigest_2. TDigest tdigest_0(0); tdigest_0.Merge(tdigest_2); EXPECT_EQ(200, tdigest_0.Compression()); EXPECT_EQ(2, tdigest_0.Max()); EXPECT_EQ(10, tdigest_0.Count()); // Now should succeed without changing compression/discrete. tdigest_0.Merge(tdigest_1); EXPECT_EQ(200, tdigest_0.Compression()); EXPECT_EQ(2, tdigest_0.Max()); EXPECT_EQ(20, tdigest_0.Count()); } } // Sample generators for Cdf() and Percentile() precision tests. // Returns a vector of `val` repeated `count` times. std::vector ConstantSamples(int count, double val) { std::vector v; v.reserve(count); for (int i = 0; i < count; ++i) { v.push_back(val); } return v; } // Returns a vector of `count` number of samples from Normal(mean, stdev. std::vector NormalSamples(int count, double mean, double stdev) { std::vector v; v.reserve(count); absl::BitGen rng; for (int i = 0; i < count; i++) { v.push_back(absl::Gaussian(rng, mean, stdev)); } return v; } // Returns a vector of `count` number of samples drawn uniformly randomly in // range [from, to]. template std::vector UniformSamples(int count, double from, double to) { std::vector v; v.reserve(count); absl::BitGen rng; for (int i = 0; i < count; i++) { v.push_back(static_cast(absl::Uniform(rng, from, to))); } return v; } struct PrecisionTestParam { // Function to get samples vector. std::function()> generate_samples; // Map of {percentile or val, max error bound}. absl::flat_hash_map max_errors; }; class QuantilePrecisionTest : public ::testing::TestWithParam { public: static constexpr double kCompression = 100; static constexpr double kMaxCentroids = 200; }; // Tests max and average Percentile() errors against the true percentile. TEST_P(QuantilePrecisionTest, QuantilePrecisionTest) { // We expect higher precision near both ends. const absl::flat_hash_map& max_error_bounds = GetParam().max_errors; const std::vector samples = GetParam().generate_samples(); TDigest tdigest(kCompression); for (double i : samples) { tdigest.Add(i); } for (const auto& p : max_error_bounds) { double quantile = p.first; double error = abs(tdigest.Quantile(quantile) - GetTrueQuantile(samples, quantile)); EXPECT_LE(error, p.second) << "(quantile=" << quantile << ") actual:" << tdigest.Quantile(quantile) << " expected:" << GetTrueQuantile(samples, quantile); } } // Same as above but merge every 1000 samples. TEST_P(QuantilePrecisionTest, MergeQuantilePrecisionTest) { // We expect higher precision near both ends. const absl::flat_hash_map& max_error_bounds = GetParam().max_errors; const std::vector samples = GetParam().generate_samples(); TDigest tdigest(kCompression); TDigest temp(kCompression); for (double i : samples) { temp.Add(i); if (temp.Count() == 1000) { tdigest.Merge(temp); temp.Reset(kCompression); } } tdigest.Merge(temp); ASSERT_EQ(tdigest.Count(), samples.size()); for (const auto& p : max_error_bounds) { double quantile = p.first; double error = abs(tdigest.Quantile(quantile) - GetTrueQuantile(samples, quantile)); EXPECT_LE(error, p.second) << "(quantile=" << quantile << ") actual:" << tdigest.Quantile(quantile) << " expected:" << GetTrueQuantile(samples, quantile); } } INSTANTIATE_TEST_SUITE_P( QuantilePrecisionTest, QuantilePrecisionTest, ::testing::Values( // Constant PrecisionTestParam{[]() { return ConstantSamples(100000, 10); }, {{0.01, 0}, {0.5, 0}, {0.99, 0}}}, // Continuous samples PrecisionTestParam{[]() { return NormalSamples(100000, 0, 5); }, {{0.01, 0.5}, {0.5, 1}, {0.99, 0.5}}}, PrecisionTestParam{[]() { return UniformSamples(100000, -5000, 5000); }, {{0.01, 20}, {0.5, 50}, {0.99, 20}}})); class CdfPrecisionTest : public ::testing::TestWithParam { public: static constexpr double kCompression = 100; static constexpr double kMaxCentroids = 200; }; // Tests max and average Percentile() errors against the true percentile. TEST_P(CdfPrecisionTest, CdfPrecisionTest) { // We expect higher precision near both ends. const absl::flat_hash_map& max_error_bounds = GetParam().max_errors; const std::vector samples = GetParam().generate_samples(); TDigest tdigest(kCompression); for (double i : samples) { tdigest.Add(i); } ASSERT_EQ(tdigest.Count(), samples.size()); for (const auto& p : max_error_bounds) { double val = p.first; double error = abs(tdigest.Cdf(val) - GetTrueCdf(samples, val)); EXPECT_LE(error, p.second) << "(val=" << val << ") actual:" << tdigest.Cdf(val) << " expected:" << GetTrueCdf(samples, val); } } // Same as above but merge every 1000 samples. TEST_P(CdfPrecisionTest, MergeCdfPrecisionTest) { // We expect higher precision near both ends. const absl::flat_hash_map& max_error_bounds = GetParam().max_errors; const std::vector samples = GetParam().generate_samples(); TDigest tdigest(kCompression); TDigest temp(kCompression); for (double i : samples) { temp.Add(i); if (temp.Count() == 1000) { tdigest.Merge(temp); temp.Reset(kCompression); } } tdigest.Merge(temp); ASSERT_EQ(tdigest.Count(), samples.size()); for (const auto& p : max_error_bounds) { double val = p.first; double error = abs(tdigest.Cdf(val) - GetTrueCdf(samples, val)); EXPECT_LE(error, p.second) << "(val=" << val << ") actual:" << tdigest.Cdf(val) << " expected:" << GetTrueCdf(samples, val); } } INSTANTIATE_TEST_SUITE_P( CdfPrecisionTest, CdfPrecisionTest, ::testing::Values( // Constant. PrecisionTestParam{[]() { return ConstantSamples(100000, 10); }, {{9, 0}, {10, 0}, {11, 0}}}, // Continuous samples. PrecisionTestParam{[]() { return NormalSamples(100000, 0, 5); }, {{-10, 0.005}, {0.0, 0.005}, {10, 0.005}}}, PrecisionTestParam{[]() { return UniformSamples(100000, -5000, 5000); }, {{-5000.1, 0}, {-4900, 0.005}, {0, 0.005}, {4900, 0.005}, {5000, 0}}}, // Dense and sparse samples. PrecisionTestParam{ []() { return UniformSamples(100000, 0, 10); }, {{-0.01, 0}, {0.01, 0.03}, {5.0, 0.05}, {9.99, 0.03}, {10, 0}}}, PrecisionTestParam{ []() { return UniformSamples(100000, -10000, 10000); }, {{-10001, 0}, {-9900, 0.005}, {0, 0.008}, {9900, 0.005}, {10000, 0}}})); TEST(TDigestEmptyStringTest, Test) { TDigest tdigest(0); ASSERT_TRUE(tdigest.FromString("").ok()); EXPECT_EQ(tdigest.ToString(), "0/0/0/0/0"); } } // namespace grpc_core int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); }