1 // Copyright 2024 gRPC authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/core/util/tdigest.h"
16
17 #include <algorithm>
18 #include <cmath>
19 #include <functional>
20 #include <limits>
21 #include <random>
22 #include <string>
23 #include <utility>
24 #include <vector>
25
26 #include "absl/container/flat_hash_map.h"
27 #include "absl/random/random.h"
28 #include "gmock/gmock.h"
29 #include "gtest/gtest.h"
30
31 using testing::DoubleNear;
32 using testing::NanSensitiveDoubleEq;
33
34 namespace grpc_core {
35 namespace {
36
37 constexpr double kNan = std::numeric_limits<double>::quiet_NaN();
38
// Computes the exact `quantile` (in [0, 1]) of `samples` by sorting a copy
// and linearly interpolating between the two nearest order statistics.
// Serves as the ground truth that TDigest::Quantile estimates are compared
// against. `samples` must be non-empty.
double GetTrueQuantile(const std::vector<double>& samples, double quantile) {
  std::vector<double> s = samples;
  std::sort(s.begin(), s.end());

  // Fractional index of the requested quantile. Clamp into the valid index
  // range: for small quantiles, size * quantile - 1 is negative and the
  // unclamped value would index out of bounds (undefined behavior).
  double true_idx = static_cast<double>(s.size()) * quantile - 1;
  true_idx = std::clamp(true_idx, 0.0, static_cast<double>(s.size()) - 1);
  double idx_left = std::floor(true_idx);
  double idx_right = std::ceil(true_idx);

  if (idx_left == idx_right) {
    return s[static_cast<size_t>(idx_left)];
  }
  // Linear interpolation between the two adjacent order statistics.
  return s[static_cast<size_t>(idx_left)] * (idx_right - true_idx) +
         s[static_cast<size_t>(idx_right)] * (true_idx - idx_left);
}
52
// Computes the exact CDF of `samples` at `val`: the fraction of samples
// that are <= val. Values below the minimum map to 0 and values at or above
// the maximum map to 1. Serves as the ground truth for TDigest::Cdf.
// `samples` must be non-empty.
double GetTrueCdf(const std::vector<double>& samples, double val) {
  std::vector<double> s = samples;
  std::sort(s.begin(), s.end());

  if (val < s.front()) {
    return 0;
  }
  if (val >= s.back()) {
    return 1;
  }

  // Rank = number of samples <= val; binary search on the sorted copy
  // replaces the original O(n) linear scan with O(log n).
  auto rank = std::upper_bound(s.begin(), s.end(), val) - s.begin();
  return static_cast<double>(rank) / static_cast<double>(s.size());
}
74
75 } // namespace
76
TEST(TDigestTest, Reset) {
  // Reset must replace the compression parameter every time it is called.
  TDigest tdigest(100);
  EXPECT_EQ(tdigest.Compression(), 100);

  for (const double compression : {50.0, 20.0}) {
    tdigest.Reset(compression);
    EXPECT_EQ(tdigest.Compression(), compression);
  }
}
87
TEST(TDigestTest, Stats) {
  // Count/Min/Max/Sum must track every added sample; Compression is fixed
  // at construction.
  TDigest tdigest(100);

  tdigest.Add(10);
  EXPECT_EQ(tdigest.Count(), 1);
  EXPECT_EQ(tdigest.Min(), 10);
  EXPECT_EQ(tdigest.Max(), 10);
  EXPECT_EQ(tdigest.Sum(), 10);
  EXPECT_EQ(tdigest.Compression(), 100);

  tdigest.Add(20);
  EXPECT_EQ(tdigest.Count(), 2);
  EXPECT_EQ(tdigest.Min(), 10);
  EXPECT_EQ(tdigest.Max(), 20);
  EXPECT_EQ(tdigest.Sum(), 30);
  EXPECT_EQ(tdigest.Compression(), 100);
}
104
TEST(TDigestTest, MergeMultipleIntoSingleValued) {
  constexpr double kMerges = 100;
  constexpr double kCompression = 100;

  TDigest tdigest(kCompression);

  // Every quantile of an empty digest is NaN.
  for (const double q : {.01, .50, .99}) {
    EXPECT_THAT(tdigest.Quantile(q), NanSensitiveDoubleEq(kNan));
  }

  // Each round: build a fresh single-sample digest, fold the accumulated
  // digest into it, then swap the result back into `tdigest`.
  for (int i = 0; i < kMerges; i++) {
    TDigest fresh(kCompression);
    fresh.Add(10);
    fresh.Merge(tdigest);
    fresh.Swap(tdigest);
  }

  EXPECT_EQ(kMerges, tdigest.Count());
  // All samples are 10, so every quantile collapses to exactly 10.
  for (const double q : {.01, .50, .99}) {
    EXPECT_THAT(tdigest.Quantile(q), NanSensitiveDoubleEq(10));
  }
}
133
TEST(TDigestTest, MergeSingleIntoMultipleValued) {
  constexpr double kMerges = 100;
  constexpr double kCompression = 100;

  TDigest tdigest(kCompression);

  // Every quantile of an empty digest is NaN.
  for (const double q : {.01, .50, .99}) {
    EXPECT_THAT(tdigest.Quantile(q), NanSensitiveDoubleEq(kNan));
  }

  // Merge one single-sample digest at a time into the accumulator.
  for (int i = 0; i < kMerges; i++) {
    TDigest single(kCompression);
    single.Add(10);
    tdigest.Merge(single);
  }

  EXPECT_EQ(kMerges, tdigest.Count());
  // All samples are 10, so every quantile collapses to exactly 10.
  for (const double q : {.01, .50, .99}) {
    EXPECT_THAT(tdigest.Quantile(q), NanSensitiveDoubleEq(10));
  }
}
161
TEST(TDigestTest, CdfBetweenLastCentroidAndMax) {
  // Make sure we return the correct CDF value for an element between the last
  // centroid and maximum.
  constexpr double kCompression = 10;
  TDigest tdigest(kCompression);

  // 300 samples in three bands: 1..100, then 100..10000 in steps of 100,
  // then 200..20000 in steps of 200 (added in that order).
  for (const int scale : {1, 100, 200}) {
    for (int i = 1; i <= 100; i++) {
      tdigest.Add(i * scale);
    }
  }

  EXPECT_THAT(tdigest.Cdf(1), NanSensitiveDoubleEq(0));
  EXPECT_THAT(tdigest.Cdf(20000), NanSensitiveDoubleEq(1));
  // Just below the maximum: nearly all mass is to the left.
  EXPECT_THAT(tdigest.Cdf(20000 - 1), DoubleNear(0.9999, 1e-4));
}
185
TEST(TDigestTest, CdfMostlyMin) {
  // Make sure we return the correct CDF value when most samples are the
  // minimum value.
  constexpr double kCompression = 10;
  constexpr double kMin = 0;
  constexpr double kMax = 1;
  TDigest tdigest(kCompression);

  // 100 copies of the minimum plus one maximum sample.
  for (int i = 0; i < 100; ++i) {
    tdigest.Add(kMin);
  }
  tdigest.Add(kMax);

  EXPECT_THAT(tdigest.Cdf(kMin), DoubleNear(0.98, 1e-3));
  EXPECT_THAT(tdigest.Cdf(kMax), NanSensitiveDoubleEq(1));
}
204
TEST(TDigestTest, SingletonInACrowd) {
  // Add a test case similar to what is reported upstream:
  // https://github.com/tdunning/t-digest/issues/89
  //
  // We want to make sure when we have 10k samples of a specific number,
  // we do not lose information about a single sample at the tail.
  constexpr int kCrowdSize = 10000;
  constexpr double kCompression = 100;

  TDigest tdigest(kCompression);

  for (int i = 0; i < kCrowdSize; i++) {
    tdigest.Add(10);
  }
  tdigest.Add(20);

  // Every quantile up to 0.999 lands inside the crowd of 10s...
  for (const double q : {0.0, 0.5, 0.8, 0.9, 0.99, 0.999}) {
    EXPECT_THAT(tdigest.Quantile(q), NanSensitiveDoubleEq(10));
  }
  // ...but the top quantile must still see the lone outlier.
  EXPECT_THAT(tdigest.Quantile(1), NanSensitiveDoubleEq(20));
}
229
TEST(TDigestTest, MergeDifferentCompressions) {
  // Two populated digests with distinct compressions and distinct values.
  TDigest tdigest_1(100);
  TDigest tdigest_2(200);
  for (int i = 0; i < 10; ++i) {
    tdigest_1.Add(1);
    tdigest_2.Add(2);
  }

  {
    // Compression/discrete should remain "to be determined".
    TDigest dst(0);
    dst.Merge(dst);
    EXPECT_EQ(0, dst.Compression());
  }

  {
    // Should take compression/discrete of tdigest_1.
    TDigest dst(0);
    dst.Merge(tdigest_1);
    EXPECT_EQ(100, dst.Compression());
    EXPECT_EQ(1, dst.Max());
    EXPECT_EQ(10, dst.Count());
  }

  {
    // Should take compression/discrete of tdigest_2.
    TDigest dst(0);
    dst.Merge(tdigest_2);
    EXPECT_EQ(200, dst.Compression());
    EXPECT_EQ(2, dst.Max());
    EXPECT_EQ(10, dst.Count());

    // Now should succeed without changing compression/discrete.
    dst.Merge(tdigest_1);
    EXPECT_EQ(200, dst.Compression());
    EXPECT_EQ(2, dst.Max());
    EXPECT_EQ(20, dst.Count());
  }
}
269
270 // Sample generators for Cdf() and Percentile() precision tests.
271
// Returns a vector of `val` repeated `count` times.
std::vector<double> ConstantSamples(int count, double val) {
  // The fill constructor states the intent directly and replaces the
  // manual reserve/push_back loop.
  return std::vector<double>(static_cast<size_t>(count), val);
}
281
282 // Returns a vector of `count` number of samples from Normal(mean, stdev.
NormalSamples(int count,double mean,double stdev)283 std::vector<double> NormalSamples(int count, double mean, double stdev) {
284 std::vector<double> v;
285 v.reserve(count);
286 absl::BitGen rng;
287 for (int i = 0; i < count; i++) {
288 v.push_back(absl::Gaussian(rng, mean, stdev));
289 }
290 return v;
291 }
292
293 // Returns a vector of `count` number of samples drawn uniformly randomly in
294 // range [from, to].
295 template <typename T = double>
UniformSamples(int count,double from,double to)296 std::vector<double> UniformSamples(int count, double from, double to) {
297 std::vector<double> v;
298 v.reserve(count);
299 absl::BitGen rng;
300 for (int i = 0; i < count; i++) {
301 v.push_back(static_cast<T>(absl::Uniform(rng, from, to)));
302 }
303 return v;
304 }
305
// One parameterized precision test case: how to generate the sample set and
// the acceptable absolute error at each probed point.
struct PrecisionTestParam {
  // Function to get samples vector.
  std::function<std::vector<double>()> generate_samples;
  // Map of {percentile or val, max error bound}.
  absl::flat_hash_map<double, double> max_errors;
};
312
// Fixture for Quantile() precision tests, parameterized by
// PrecisionTestParam (sample generator + per-quantile error bounds).
class QuantilePrecisionTest
    : public ::testing::TestWithParam<PrecisionTestParam> {
 public:
  // Compression used by every digest in this suite.
  static constexpr double kCompression = 100;
  // NOTE(review): kMaxCentroids is not referenced anywhere in this file —
  // confirm whether it is still needed.
  static constexpr double kMaxCentroids = 200;
};
319
320 // Tests max and average Percentile() errors against the true percentile.
TEST_P(QuantilePrecisionTest,QuantilePrecisionTest)321 TEST_P(QuantilePrecisionTest, QuantilePrecisionTest) {
322 // We expect higher precision near both ends.
323 const absl::flat_hash_map<double, double>& max_error_bounds =
324 GetParam().max_errors;
325
326 const std::vector<double> samples = GetParam().generate_samples();
327 TDigest tdigest(kCompression);
328 for (double i : samples) {
329 tdigest.Add(i);
330 }
331
332 for (const auto& p : max_error_bounds) {
333 double quantile = p.first;
334 double error =
335 abs(tdigest.Quantile(quantile) - GetTrueQuantile(samples, quantile));
336 EXPECT_LE(error, p.second)
337 << "(quantile=" << quantile << ") actual:" << tdigest.Quantile(quantile)
338 << " expected:" << GetTrueQuantile(samples, quantile);
339 }
340 }
341
342 // Same as above but merge every 1000 samples.
TEST_P(QuantilePrecisionTest,MergeQuantilePrecisionTest)343 TEST_P(QuantilePrecisionTest, MergeQuantilePrecisionTest) {
344 // We expect higher precision near both ends.
345 const absl::flat_hash_map<double, double>& max_error_bounds =
346 GetParam().max_errors;
347
348 const std::vector<double> samples = GetParam().generate_samples();
349 TDigest tdigest(kCompression);
350 TDigest temp(kCompression);
351 for (double i : samples) {
352 temp.Add(i);
353 if (temp.Count() == 1000) {
354 tdigest.Merge(temp);
355 temp.Reset(kCompression);
356 }
357 }
358 tdigest.Merge(temp);
359
360 ASSERT_EQ(tdigest.Count(), samples.size());
361 for (const auto& p : max_error_bounds) {
362 double quantile = p.first;
363 double error =
364 abs(tdigest.Quantile(quantile) - GetTrueQuantile(samples, quantile));
365 EXPECT_LE(error, p.second)
366 << "(quantile=" << quantile << ") actual:" << tdigest.Quantile(quantile)
367 << " expected:" << GetTrueQuantile(samples, quantile);
368 }
369 }
370
// Precision cases for the quantile suite: constant, normal, and uniform
// sample distributions, each with per-quantile absolute error bounds.
INSTANTIATE_TEST_SUITE_P(
    QuantilePrecisionTest, QuantilePrecisionTest,
    ::testing::Values(
        // Constant
        PrecisionTestParam{[]() { return ConstantSamples(100000, 10); },
                           {{0.01, 0}, {0.5, 0}, {0.99, 0}}},

        // Continuous samples
        PrecisionTestParam{[]() { return NormalSamples(100000, 0, 5); },
                           {{0.01, 0.5}, {0.5, 1}, {0.99, 0.5}}},
        PrecisionTestParam{[]() { return UniformSamples(100000, -5000, 5000); },
                           {{0.01, 20}, {0.5, 50}, {0.99, 20}}}));
383
// Fixture for Cdf() precision tests, parameterized by PrecisionTestParam
// (sample generator + per-value error bounds).
class CdfPrecisionTest : public ::testing::TestWithParam<PrecisionTestParam> {
 public:
  // Compression used by every digest in this suite.
  static constexpr double kCompression = 100;
  // NOTE(review): kMaxCentroids is not referenced anywhere in this file —
  // confirm whether it is still needed.
  static constexpr double kMaxCentroids = 200;
};
389
390 // Tests max and average Percentile() errors against the true percentile.
TEST_P(CdfPrecisionTest,CdfPrecisionTest)391 TEST_P(CdfPrecisionTest, CdfPrecisionTest) {
392 // We expect higher precision near both ends.
393 const absl::flat_hash_map<double, double>& max_error_bounds =
394 GetParam().max_errors;
395
396 const std::vector<double> samples = GetParam().generate_samples();
397 TDigest tdigest(kCompression);
398 for (double i : samples) {
399 tdigest.Add(i);
400 }
401
402 ASSERT_EQ(tdigest.Count(), samples.size());
403 for (const auto& p : max_error_bounds) {
404 double val = p.first;
405 double error = abs(tdigest.Cdf(val) - GetTrueCdf(samples, val));
406 EXPECT_LE(error, p.second)
407 << "(val=" << val << ") actual:" << tdigest.Cdf(val)
408 << " expected:" << GetTrueCdf(samples, val);
409 }
410 }
411
412 // Same as above but merge every 1000 samples.
TEST_P(CdfPrecisionTest,MergeCdfPrecisionTest)413 TEST_P(CdfPrecisionTest, MergeCdfPrecisionTest) {
414 // We expect higher precision near both ends.
415 const absl::flat_hash_map<double, double>& max_error_bounds =
416 GetParam().max_errors;
417
418 const std::vector<double> samples = GetParam().generate_samples();
419 TDigest tdigest(kCompression);
420 TDigest temp(kCompression);
421 for (double i : samples) {
422 temp.Add(i);
423 if (temp.Count() == 1000) {
424 tdigest.Merge(temp);
425 temp.Reset(kCompression);
426 }
427 }
428 tdigest.Merge(temp);
429
430 ASSERT_EQ(tdigest.Count(), samples.size());
431 for (const auto& p : max_error_bounds) {
432 double val = p.first;
433 double error = abs(tdigest.Cdf(val) - GetTrueCdf(samples, val));
434 EXPECT_LE(error, p.second)
435 << "(val=" << val << ") actual:" << tdigest.Cdf(val)
436 << " expected:" << GetTrueCdf(samples, val);
437 }
438 }
439
// Precision cases for the CDF suite: constant, normal, uniform-continuous,
// and uniform-integer (dense and sparse) distributions, each with per-value
// absolute error bounds on the CDF.
INSTANTIATE_TEST_SUITE_P(
    CdfPrecisionTest, CdfPrecisionTest,
    ::testing::Values(
        // Constant.
        PrecisionTestParam{[]() { return ConstantSamples(100000, 10); },
                           {{9, 0}, {10, 0}, {11, 0}}},

        // Continuous samples.
        PrecisionTestParam{[]() { return NormalSamples(100000, 0, 5); },
                           {{-10, 0.005}, {0.0, 0.005}, {10, 0.005}}},
        PrecisionTestParam{[]() { return UniformSamples(100000, -5000, 5000); },
                           {{-5000.1, 0},
                            {-4900, 0.005},
                            {0, 0.005},
                            {4900, 0.005},
                            {5000, 0}}},

        // Dense and sparse samples.
        PrecisionTestParam{
            []() { return UniformSamples<int>(100000, 0, 10); },
            {{-0.01, 0}, {0.01, 0.03}, {5.0, 0.05}, {9.99, 0.03}, {10, 0}}},
        PrecisionTestParam{
            []() { return UniformSamples<int>(100000, -10000, 10000); },
            {{-10001, 0},
             {-9900, 0.005},
             {0, 0.008},
             {9900, 0.005},
             {10000, 0}}}));
468
// Deserializing an empty string must succeed and yield the canonical
// representation of an empty digest.
TEST(TDigestEmptyStringTest, Test) {
  TDigest tdigest(0);
  ASSERT_TRUE(tdigest.FromString("").ok());
  EXPECT_EQ(tdigest.ToString(), "0/0/0/0/0");
}
474
475 } // namespace grpc_core
476
// Test entry point: InitGoogleTest strips gtest flags from argv before the
// suite runs.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
481