• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2024 gRPC authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/core/util/tdigest.h"
16 
17 #include <algorithm>
18 #include <cmath>
19 #include <functional>
20 #include <limits>
21 #include <random>
22 #include <string>
23 #include <utility>
24 #include <vector>
25 
26 #include "absl/container/flat_hash_map.h"
27 #include "absl/random/random.h"
28 #include "gmock/gmock.h"
29 #include "gtest/gtest.h"
30 
31 using testing::DoubleNear;
32 using testing::NanSensitiveDoubleEq;
33 
34 namespace grpc_core {
35 namespace {
36 
37 constexpr double kNan = std::numeric_limits<double>::quiet_NaN();
38 
GetTrueQuantile(const std::vector<double> & samples,double quantile)39 double GetTrueQuantile(const std::vector<double>& samples, double quantile) {
40   std::vector<double> s = samples;
41   double true_idx = static_cast<double>(s.size()) * quantile - 1;
42   double idx_left = std::floor(true_idx);
43   double idx_right = std::ceil(true_idx);
44 
45   std::sort(s.begin(), s.end());
46   if (idx_left == idx_right) {
47     return s[idx_left];
48   }
49   return s[idx_left] * (idx_right - true_idx) +
50          s[idx_right] * (true_idx - idx_left);
51 }
52 
GetTrueCdf(const std::vector<double> & samples,double val)53 double GetTrueCdf(const std::vector<double>& samples, double val) {
54   std::vector<double> s = samples;
55   std::sort(s.begin(), s.end());
56 
57   if (val < s.front()) {
58     return 0;
59   }
60   if (val >= s.back()) {
61     return 1;
62   }
63 
64   int true_idx = 0;
65   for (double v : s) {
66     if (v > val) {
67       break;
68     }
69     ++true_idx;
70   }
71 
72   return static_cast<double>(true_idx) / static_cast<double>(samples.size());
73 }
74 
75 }  // namespace
76 
TEST(TDigestTest,Reset)77 TEST(TDigestTest, Reset) {
78   TDigest tdigest(100);
79   EXPECT_EQ(tdigest.Compression(), 100);
80 
81   tdigest.Reset(50);
82   EXPECT_EQ(tdigest.Compression(), 50);
83 
84   tdigest.Reset(20);
85   EXPECT_EQ(tdigest.Compression(), 20);
86 }
87 
TEST(TDigestTest,Stats)88 TEST(TDigestTest, Stats) {
89   TDigest tdigest(100);
90   tdigest.Add(10);
91   EXPECT_EQ(1, tdigest.Count());
92   EXPECT_EQ(10, tdigest.Min());
93   EXPECT_EQ(10, tdigest.Max());
94   EXPECT_EQ(10, tdigest.Sum());
95   EXPECT_EQ(100, tdigest.Compression());
96 
97   tdigest.Add(20);
98   EXPECT_EQ(2, tdigest.Count());
99   EXPECT_EQ(10, tdigest.Min());
100   EXPECT_EQ(20, tdigest.Max());
101   EXPECT_EQ(30, tdigest.Sum());
102   EXPECT_EQ(100, tdigest.Compression());
103 }
104 
TEST(TDigestTest,MergeMultipleIntoSingleValued)105 TEST(TDigestTest, MergeMultipleIntoSingleValued) {
106   constexpr double kMerges = 100;
107   constexpr double kCompression = 100;
108 
109   TDigest tdigest(kCompression);
110 
111   auto p01 = tdigest.Quantile(.01);
112   auto p50 = tdigest.Quantile(.50);
113   auto p99 = tdigest.Quantile(.99);
114   EXPECT_THAT(p01, NanSensitiveDoubleEq(kNan));
115   EXPECT_THAT(p50, NanSensitiveDoubleEq(kNan));
116   EXPECT_THAT(p99, NanSensitiveDoubleEq(kNan));
117 
118   for (int i = 0; i < kMerges; i++) {
119     TDigest new_tdigest(kCompression);
120     new_tdigest.Add(10);
121     new_tdigest.Merge(tdigest);
122     new_tdigest.Swap(tdigest);
123   }
124 
125   EXPECT_EQ(kMerges, tdigest.Count());
126   p01 = tdigest.Quantile(.01);
127   p50 = tdigest.Quantile(.50);
128   p99 = tdigest.Quantile(.99);
129   EXPECT_THAT(p01, NanSensitiveDoubleEq(10));
130   EXPECT_THAT(p50, NanSensitiveDoubleEq(10));
131   EXPECT_THAT(p99, NanSensitiveDoubleEq(10));
132 }
133 
TEST(TDigestTest,MergeSingleIntoMultipleValued)134 TEST(TDigestTest, MergeSingleIntoMultipleValued) {
135   constexpr double kMerges = 100;
136   constexpr double kCompression = 100;
137 
138   TDigest tdigest(kCompression);
139 
140   auto p01 = tdigest.Quantile(.01);
141   auto p50 = tdigest.Quantile(.50);
142   auto p99 = tdigest.Quantile(.99);
143   EXPECT_THAT(p01, NanSensitiveDoubleEq(kNan));
144   EXPECT_THAT(p50, NanSensitiveDoubleEq(kNan));
145   EXPECT_THAT(p99, NanSensitiveDoubleEq(kNan));
146 
147   for (int i = 0; i < kMerges; i++) {
148     TDigest new_tdigest(kCompression);
149     new_tdigest.Add(10);
150     tdigest.Merge(new_tdigest);
151   }
152 
153   EXPECT_EQ(kMerges, tdigest.Count());
154   p01 = tdigest.Quantile(.01);
155   p50 = tdigest.Quantile(.50);
156   p99 = tdigest.Quantile(.99);
157   EXPECT_THAT(p01, NanSensitiveDoubleEq(10));
158   EXPECT_THAT(p50, NanSensitiveDoubleEq(10));
159   EXPECT_THAT(p99, NanSensitiveDoubleEq(10));
160 }
161 
TEST(TDigestTest,CdfBetweenLastCentroidAndMax)162 TEST(TDigestTest, CdfBetweenLastCentroidAndMax) {
163   // Make sure we return the correct CDF value for an element between the last
164   // centroid and maximum.
165   constexpr double kCompression = 10;
166   TDigest tdigest(kCompression);
167 
168   for (int i = 1; i <= 100; i++) {
169     tdigest.Add(i);
170   }
171   for (int i = 1; i <= 100; i++) {
172     tdigest.Add(i * 100);
173   }
174   for (int i = 1; i <= 100; i++) {
175     tdigest.Add(i * 200);
176   }
177 
178   auto cdf_min = tdigest.Cdf(1);
179   EXPECT_THAT(cdf_min, NanSensitiveDoubleEq(0));
180   auto cdf_max = tdigest.Cdf(20000);
181   EXPECT_THAT(cdf_max, NanSensitiveDoubleEq(1));
182   auto cdf_tail = tdigest.Cdf(20000 - 1);
183   EXPECT_THAT(cdf_tail, DoubleNear(0.9999, 1e-4));
184 }
185 
TEST(TDigestTest,CdfMostlyMin)186 TEST(TDigestTest, CdfMostlyMin) {
187   // Make sure we return the correct CDF value when most samples are the
188   // minimum value.
189   constexpr double kCompression = 10;
190   constexpr double kMin = 0;
191   constexpr double kMax = 1;
192   TDigest tdigest(kCompression);
193 
194   for (int i = 0; i < 100; i++) {
195     tdigest.Add(kMin);
196   }
197   tdigest.Add(kMax);
198 
199   auto cdf_min = tdigest.Cdf(kMin);
200   EXPECT_THAT(cdf_min, DoubleNear(0.98, 1e-3));
201   auto cdf_max = tdigest.Cdf(kMax);
202   EXPECT_THAT(cdf_max, NanSensitiveDoubleEq(1));
203 }
204 
TEST(TDigestTest,SingletonInACrowd)205 TEST(TDigestTest, SingletonInACrowd) {
206   // Add a test case similar to what is reported upstream:
207   // https://github.com/tdunning/t-digest/issues/89
208   //
209   // We want to make sure when we have 10k samples of a specific number,
210   // we do not lose information about a single sample at the tail.
211   constexpr int kCrowdSize = 10000;
212   constexpr double kCompression = 100;
213 
214   TDigest tdigest(kCompression);
215 
216   for (int i = 0; i < kCrowdSize; i++) {
217     tdigest.Add(10);
218   }
219   tdigest.Add(20);
220 
221   EXPECT_THAT(tdigest.Quantile(0), NanSensitiveDoubleEq(10));
222   EXPECT_THAT(tdigest.Quantile(0.5), NanSensitiveDoubleEq(10));
223   EXPECT_THAT(tdigest.Quantile(0.8), NanSensitiveDoubleEq(10));
224   EXPECT_THAT(tdigest.Quantile(0.9), NanSensitiveDoubleEq(10));
225   EXPECT_THAT(tdigest.Quantile(0.99), NanSensitiveDoubleEq(10));
226   EXPECT_THAT(tdigest.Quantile(0.999), NanSensitiveDoubleEq(10));
227   EXPECT_THAT(tdigest.Quantile(1), NanSensitiveDoubleEq(20));
228 }
229 
TEST(TDigestTest,MergeDifferentCompressions)230 TEST(TDigestTest, MergeDifferentCompressions) {
231   TDigest tdigest_1(100);
232   TDigest tdigest_2(200);
233   for (int i = 0; i < 10; ++i) {
234     tdigest_1.Add(1);
235     tdigest_2.Add(2);
236   }
237 
238   {
239     // Compression/discrete should remain "to be determined".
240     TDigest tdigest_0(0);
241     tdigest_0.Merge(tdigest_0);
242     EXPECT_EQ(0, tdigest_0.Compression());
243   }
244 
245   {
246     // Should take compression/discrete of tdigest_1.
247     TDigest tdigest_0(0);
248     tdigest_0.Merge(tdigest_1);
249     EXPECT_EQ(100, tdigest_0.Compression());
250     EXPECT_EQ(1, tdigest_0.Max());
251     EXPECT_EQ(10, tdigest_0.Count());
252   }
253 
254   {
255     // Should take compression/discrete of tdigest_2.
256     TDigest tdigest_0(0);
257     tdigest_0.Merge(tdigest_2);
258     EXPECT_EQ(200, tdigest_0.Compression());
259     EXPECT_EQ(2, tdigest_0.Max());
260     EXPECT_EQ(10, tdigest_0.Count());
261 
262     // Now should succeed without changing compression/discrete.
263     tdigest_0.Merge(tdigest_1);
264     EXPECT_EQ(200, tdigest_0.Compression());
265     EXPECT_EQ(2, tdigest_0.Max());
266     EXPECT_EQ(20, tdigest_0.Count());
267   }
268 }
269 
270 // Sample generators for Cdf() and Percentile() precision tests.
271 
272 // Returns a vector of `val` repeated `count` times.
ConstantSamples(int count,double val)273 std::vector<double> ConstantSamples(int count, double val) {
274   std::vector<double> v;
275   v.reserve(count);
276   for (int i = 0; i < count; ++i) {
277     v.push_back(val);
278   }
279   return v;
280 }
281 
282 // Returns a vector of `count` number of samples from Normal(mean, stdev.
NormalSamples(int count,double mean,double stdev)283 std::vector<double> NormalSamples(int count, double mean, double stdev) {
284   std::vector<double> v;
285   v.reserve(count);
286   absl::BitGen rng;
287   for (int i = 0; i < count; i++) {
288     v.push_back(absl::Gaussian(rng, mean, stdev));
289   }
290   return v;
291 }
292 
293 // Returns a vector of `count` number of samples drawn uniformly randomly in
294 // range [from, to].
295 template <typename T = double>
UniformSamples(int count,double from,double to)296 std::vector<double> UniformSamples(int count, double from, double to) {
297   std::vector<double> v;
298   v.reserve(count);
299   absl::BitGen rng;
300   for (int i = 0; i < count; i++) {
301     v.push_back(static_cast<T>(absl::Uniform(rng, from, to)));
302   }
303   return v;
304 }
305 
306 struct PrecisionTestParam {
307   // Function to get samples vector.
308   std::function<std::vector<double>()> generate_samples;
309   // Map of {percentile or val, max error bound}.
310   absl::flat_hash_map<double, double> max_errors;
311 };
312 
313 class QuantilePrecisionTest
314     : public ::testing::TestWithParam<PrecisionTestParam> {
315  public:
316   static constexpr double kCompression = 100;
317   static constexpr double kMaxCentroids = 200;
318 };
319 
320 // Tests max and average Percentile() errors against the true percentile.
TEST_P(QuantilePrecisionTest,QuantilePrecisionTest)321 TEST_P(QuantilePrecisionTest, QuantilePrecisionTest) {
322   // We expect higher precision near both ends.
323   const absl::flat_hash_map<double, double>& max_error_bounds =
324       GetParam().max_errors;
325 
326   const std::vector<double> samples = GetParam().generate_samples();
327   TDigest tdigest(kCompression);
328   for (double i : samples) {
329     tdigest.Add(i);
330   }
331 
332   for (const auto& p : max_error_bounds) {
333     double quantile = p.first;
334     double error =
335         abs(tdigest.Quantile(quantile) - GetTrueQuantile(samples, quantile));
336     EXPECT_LE(error, p.second)
337         << "(quantile=" << quantile << ") actual:" << tdigest.Quantile(quantile)
338         << " expected:" << GetTrueQuantile(samples, quantile);
339   }
340 }
341 
342 // Same as above but merge every 1000 samples.
TEST_P(QuantilePrecisionTest,MergeQuantilePrecisionTest)343 TEST_P(QuantilePrecisionTest, MergeQuantilePrecisionTest) {
344   // We expect higher precision near both ends.
345   const absl::flat_hash_map<double, double>& max_error_bounds =
346       GetParam().max_errors;
347 
348   const std::vector<double> samples = GetParam().generate_samples();
349   TDigest tdigest(kCompression);
350   TDigest temp(kCompression);
351   for (double i : samples) {
352     temp.Add(i);
353     if (temp.Count() == 1000) {
354       tdigest.Merge(temp);
355       temp.Reset(kCompression);
356     }
357   }
358   tdigest.Merge(temp);
359 
360   ASSERT_EQ(tdigest.Count(), samples.size());
361   for (const auto& p : max_error_bounds) {
362     double quantile = p.first;
363     double error =
364         abs(tdigest.Quantile(quantile) - GetTrueQuantile(samples, quantile));
365     EXPECT_LE(error, p.second)
366         << "(quantile=" << quantile << ") actual:" << tdigest.Quantile(quantile)
367         << " expected:" << GetTrueQuantile(samples, quantile);
368   }
369 }
370 
371 INSTANTIATE_TEST_SUITE_P(
372     QuantilePrecisionTest, QuantilePrecisionTest,
373     ::testing::Values(
374         // Constant
__anon000b361a0202() 375         PrecisionTestParam{[]() { return ConstantSamples(100000, 10); },
376                            {{0.01, 0}, {0.5, 0}, {0.99, 0}}},
377 
378         // Continuous samples
__anon000b361a0402() 379         PrecisionTestParam{[]() { return NormalSamples(100000, 0, 5); },
380                            {{0.01, 0.5}, {0.5, 1}, {0.99, 0.5}}},
__anon000b361a0602() 381         PrecisionTestParam{[]() { return UniformSamples(100000, -5000, 5000); },
382                            {{0.01, 20}, {0.5, 50}, {0.99, 20}}}));
383 
384 class CdfPrecisionTest : public ::testing::TestWithParam<PrecisionTestParam> {
385  public:
386   static constexpr double kCompression = 100;
387   static constexpr double kMaxCentroids = 200;
388 };
389 
390 // Tests max and average Percentile() errors against the true percentile.
TEST_P(CdfPrecisionTest,CdfPrecisionTest)391 TEST_P(CdfPrecisionTest, CdfPrecisionTest) {
392   // We expect higher precision near both ends.
393   const absl::flat_hash_map<double, double>& max_error_bounds =
394       GetParam().max_errors;
395 
396   const std::vector<double> samples = GetParam().generate_samples();
397   TDigest tdigest(kCompression);
398   for (double i : samples) {
399     tdigest.Add(i);
400   }
401 
402   ASSERT_EQ(tdigest.Count(), samples.size());
403   for (const auto& p : max_error_bounds) {
404     double val = p.first;
405     double error = abs(tdigest.Cdf(val) - GetTrueCdf(samples, val));
406     EXPECT_LE(error, p.second)
407         << "(val=" << val << ") actual:" << tdigest.Cdf(val)
408         << " expected:" << GetTrueCdf(samples, val);
409   }
410 }
411 
412 // Same as above but merge every 1000 samples.
TEST_P(CdfPrecisionTest,MergeCdfPrecisionTest)413 TEST_P(CdfPrecisionTest, MergeCdfPrecisionTest) {
414   // We expect higher precision near both ends.
415   const absl::flat_hash_map<double, double>& max_error_bounds =
416       GetParam().max_errors;
417 
418   const std::vector<double> samples = GetParam().generate_samples();
419   TDigest tdigest(kCompression);
420   TDigest temp(kCompression);
421   for (double i : samples) {
422     temp.Add(i);
423     if (temp.Count() == 1000) {
424       tdigest.Merge(temp);
425       temp.Reset(kCompression);
426     }
427   }
428   tdigest.Merge(temp);
429 
430   ASSERT_EQ(tdigest.Count(), samples.size());
431   for (const auto& p : max_error_bounds) {
432     double val = p.first;
433     double error = abs(tdigest.Cdf(val) - GetTrueCdf(samples, val));
434     EXPECT_LE(error, p.second)
435         << "(val=" << val << ") actual:" << tdigest.Cdf(val)
436         << " expected:" << GetTrueCdf(samples, val);
437   }
438 }
439 
440 INSTANTIATE_TEST_SUITE_P(
441     CdfPrecisionTest, CdfPrecisionTest,
442     ::testing::Values(
443         // Constant.
__anon000b361a0802() 444         PrecisionTestParam{[]() { return ConstantSamples(100000, 10); },
445                            {{9, 0}, {10, 0}, {11, 0}}},
446 
447         // Continuous samples.
__anon000b361a0a02() 448         PrecisionTestParam{[]() { return NormalSamples(100000, 0, 5); },
449                            {{-10, 0.005}, {0.0, 0.005}, {10, 0.005}}},
__anon000b361a0c02() 450         PrecisionTestParam{[]() { return UniformSamples(100000, -5000, 5000); },
451                            {{-5000.1, 0},
452                             {-4900, 0.005},
453                             {0, 0.005},
454                             {4900, 0.005},
455                             {5000, 0}}},
456 
457         // Dense and sparse samples.
458         PrecisionTestParam{
__anon000b361a0e02() 459             []() { return UniformSamples<int>(100000, 0, 10); },
__anon000b361a0f02() 460             {{-0.01, 0}, {0.01, 0.03}, {5.0, 0.05}, {9.99, 0.03}, {10, 0}}},
461         PrecisionTestParam{
__anon000b361a1002() 462             []() { return UniformSamples<int>(100000, -10000, 10000); },
__anon000b361a1102() 463             {{-10001, 0},
464              {-9900, 0.005},
465              {0, 0.008},
466              {9900, 0.005},
467              {10000, 0}}}));
468 
TEST(TDigestEmptyStringTest,Test)469 TEST(TDigestEmptyStringTest, Test) {
470   TDigest tdigest(0);
471   ASSERT_TRUE(tdigest.FromString("").ok());
472   EXPECT_EQ(tdigest.ToString(), "0/0/0/0/0");
473 }
474 
475 }  // namespace grpc_core
476 
main(int argc,char ** argv)477 int main(int argc, char** argv) {
478   ::testing::InitGoogleTest(&argc, argv);
479   return RUN_ALL_TESTS();
480 }
481