• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2024 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "testing/perf/confidence/ratio_bootstrap_estimator.h"
6 
7 #define _USE_MATH_DEFINES  // Needed to get M_SQRT1_2 on Windows.
8 #include <math.h>
9 
10 #include <algorithm>
11 #include <memory>
12 #include <vector>
13 
14 #ifdef UNSAFE_BUFFERS_BUILD
15 // Not used with untrusted inputs.
16 #pragma allow_unsafe_buffers
17 #endif
18 
19 using std::lower_bound;
20 using std::min;
21 using std::numeric_limits;
22 using std::sort;
23 using std::unique_ptr;
24 using std::vector;
25 
// Inverse normal CDF, e.g. InverseNormalCDF(0.975) ~= 1.96
// (a 95% CI will cover +/- 1.96 standard deviations from the mean).
//
// For some reason, C has erf() in its standard library, but not its inverse,
// so we have to build it ourselves (which is a bit annoying, since we only
// really want it to convert the confidence interval quantiles!). This is
// nontrivial, but fortunately, others have figured it out. This is an
// implementation of “Algorithm AS 241: The Percentage Points of the Normal
// Distribution” (Wichura), and is roughly the same as was used in GNU R
// until recently. We don't need extreme precision, so we've only used the
// version that is accurate to about seven decimal digits (the other one
// is pretty much the same, just with even more constants).
double RatioBootstrapEstimator::InverseNormalCDF(double p) {
  double q = p - 0.5;
  if (fabs(q) < 0.425) {
    // Central region, 0.075 < p < 0.925: a single rational (cubic/cubic)
    // approximation in r = 0.180625 - q² suffices. Constants are AS 241's
    // PPND7 a/b coefficients.
    const double a0 = 3.3871327179e0;
    const double a1 = 5.0434271938e1;
    const double a2 = 1.5929113202e2;
    const double a3 = 5.9109374720e1;
    const double b1 = 1.7895169469e1;
    const double b2 = 7.8757757664e1;
    const double b3 = 6.7187563600e1;

    double r = 0.180625 - q * q;
    return q * (((a3 * r + a2) * r + a1) * r + a0) /
           (((b3 * r + b2) * r + b1) * r + 1.0);
  } else {
    // Tail region: work with the distance to the nearer endpoint, and
    // substitute r = sqrt(-log(r)) per AS 241.
    double r = (q < 0) ? p : 1.0 - p;
    if (r < 0.0) {
      // p is outside [0, 1]; the quantile function is undefined there.
      return numeric_limits<double>::quiet_NaN();
    }
    r = sqrt(-log(r));
    double ret;
    if (r < 5.0) {
      // Intermediate tail (roughly 1.1e-9 < min(p, 1-p) <= 0.075):
      // AS 241's c/d coefficients, expanded around r = 1.6.
      const double c0 = 1.4234372777e0;
      const double c1 = 2.7568153900e0;
      const double c2 = 1.3067284816e0;
      const double c3 = 1.7023821103e-1;
      const double d1 = 7.3700164250e-1;
      const double d2 = 1.2021132975e-1;

      r -= 1.6;
      ret = (((c3 * r + c2) * r + c1) * r + c0) / ((d2 * r + d1) * r + 1.0);
    } else {
      // Far tail: AS 241's e/f coefficients, expanded around r = 5.0.
      const double e0 = 6.6579051150e0;
      const double e1 = 3.0812263860e0;
      const double e2 = 4.2868294337e-1;
      const double e3 = 1.7337203997e-2;
      const double f1 = 2.4197894225e-1;
      const double f2 = 1.2258202635e-2;

      r -= 5.0;
      ret = (((e3 * r + e2) * r + e1) * r + e0) / ((f2 * r + f1) * r + 1.0);
    }
    // The approximations above are for the upper tail; mirror for p < 0.5.
    return (q < 0) ? -ret : ret;
  }
}
83 
84 namespace {
85 
// Standard normal (Gaussian) CDF, e.g. NormalCDF(1.96) ~= 0.975
// (+/- 1.96 standard deviations would cover a 95% CI).
// Expressed via the complementary error function: Phi(q) = erfc(-q/sqrt(2))/2.
double NormalCDF(double q) {
  const double scaled = -q * M_SQRT1_2;
  return 0.5 * erfc(scaled);
}
91 
92 // Compute percentiles of the bootstrap distribution (the inverse of G).
93 // We estimate G by Ĝ, the bootstrap estimate of G (text above eq. 2.9
94 // in the paper). Note that unlike bcajack, we interpolate between values
95 // to get slightly better accuracy.
ComputeBCa(const double * estimates,size_t num_estimates,double alpha,double z0,double a)96 double ComputeBCa(const double* estimates,
97                   size_t num_estimates,
98                   double alpha,
99                   double z0,
100                   double a) {
101   double z_alpha = RatioBootstrapEstimator::InverseNormalCDF(alpha);
102 
103   // Eq. (2.2); the basic BCa formula.
104   double q = NormalCDF(z0 + (z0 + z_alpha) / (1 - a * (z0 + z_alpha)));
105 
106   double index = q * (num_estimates - 1);
107   int base_index = index;
108   if (base_index == static_cast<int>(num_estimates - 1)) {
109     // The edge of the CDF; note that R would warn in this case.
110     return estimates[base_index];
111   }
112   double frac = index - base_index;
113   return estimates[base_index] +
114          frac * (estimates[base_index + 1] - estimates[base_index]);
115 }
116 
// Calculate Ĝ (the fraction of estimates that are less than search_val).
// `estimates` must be sorted ascending.
double FindCDF(const double* estimates,
               size_t num_estimates,
               double search_val) {
  const double* end = estimates + num_estimates;

  // First element x where x >= search_val.
  const double* first_ge = std::lower_bound(estimates, end, search_val);
  if (first_ge == end) {
    // Every estimate is less than search_val.
    // Note that R warns in this case.
    return 1.0;
  }

  const size_t num_below = first_ge - estimates;
  if (num_below == 0) {
    // Every estimate is >= search_val.
    // Note that R warns in this case.
    return 0.0;
  }

  // TODO(sesse): Consider whether we should interpolate here, like in
  // compute_bca().
  return static_cast<double>(num_below) / static_cast<double>(num_estimates);
}
140 
141 }  // namespace
142 
143 // Find the ratio estimate over all values except for the one at skip_index,
144 // i.e., leave-one-out. (If skip_index == -1 or similar, simply compute over
145 // all values.) This is used in the jackknife estimate for the acceleration.
EstimateRatioExcept(const vector<RatioBootstrapEstimator::Sample> & x,int skip_index)146 double RatioBootstrapEstimator::EstimateRatioExcept(
147     const vector<RatioBootstrapEstimator::Sample>& x,
148     int skip_index) {
149   double before = 0.0, after = 0.0;
150   for (unsigned i = 0; i < x.size(); ++i) {
151     if (static_cast<int>(i) == skip_index) {
152       continue;
153     }
154     before += x[i].before;
155     after += x[i].after;
156   }
157   return before / after;
158 }
159 
160 // Similar, for the geometric mean across all the data sets.
EstimateGeometricMeanExcept(const vector<vector<RatioBootstrapEstimator::Sample>> & x,int skip_index)161 double RatioBootstrapEstimator::EstimateGeometricMeanExcept(
162     const vector<vector<RatioBootstrapEstimator::Sample>>& x,
163     int skip_index) {
164   double geometric_mean = 1.0;
165   for (const auto& samples : x) {
166     geometric_mean *= EstimateRatioExcept(samples, skip_index);
167   }
168   return pow(geometric_mean, 1.0 / x.size());
169 }
170 
171 vector<RatioBootstrapEstimator::Estimate>
ComputeRatioEstimates(const vector<vector<RatioBootstrapEstimator::Sample>> & data,unsigned num_resamples,double confidence_level,bool compute_geometric_mean)172 RatioBootstrapEstimator::ComputeRatioEstimates(
173     const vector<vector<RatioBootstrapEstimator::Sample>>& data,
174     unsigned num_resamples,
175     double confidence_level,
176     bool compute_geometric_mean) {
177   if (data.empty() || num_resamples < 10 || confidence_level <= 0.0 ||
178       confidence_level >= 1.0) {
179     return {};
180   }
181 
182   unsigned num_observations = numeric_limits<unsigned>::max();
183   for (const vector<Sample>& samples : data) {
184     num_observations = min<unsigned>(num_observations, samples.size());
185   }
186 
187   // Allocate some memory for temporaries that we need.
188   unsigned num_dimensions = data.size();
189   unique_ptr<double[]> before(new double[num_dimensions]);
190   unique_ptr<double[]> after(new double[num_dimensions]);
191   unique_ptr<double[]> all_estimates(
192       new double[(num_dimensions + compute_geometric_mean) * num_resamples]);
193 
194   // Do our bootstrap resampling. Note that we can sample independently
195   // from the numerator and denumerator (which the R packages cannot do);
196   // this makes sense, because they are not pairs. This allows us to sometimes
197   // get slightly narrower confidence intervals.
198   //
199   // When computing the geometric mean, we could perhaps consider doing
200   // similar independent sampling across the various data sets, but we
201   // currently don't do so.
202   for (unsigned i = 0; i < num_resamples; ++i) {
203     for (unsigned d = 0; d < num_dimensions; ++d) {
204       before[d] = 0.0;
205       after[d] = 0.0;
206     }
207     for (unsigned j = 0; j < num_observations; ++j) {
208       unsigned r1 = gen_();
209       unsigned r2 = gen_();
210 
211       // NOTE: The bias from the modulo here should be insignificant.
212       for (unsigned d = 0; d < num_dimensions; ++d) {
213         unsigned index1 = r1 % data[d].size();
214         unsigned index2 = r2 % data[d].size();
215         before[d] += data[d][index1].before;
216         after[d] += data[d][index2].after;
217       }
218     }
219     double geometric_mean = 1.0;
220     for (unsigned d = 0; d < num_dimensions; ++d) {
221       double ratio = before[d] / after[d];
222       all_estimates[d * num_resamples + i] = ratio;
223       geometric_mean *= ratio;
224     }
225     if (compute_geometric_mean) {
226       all_estimates[num_dimensions * num_resamples + i] =
227           pow(geometric_mean, 1.0 / num_dimensions);
228     }
229   }
230 
231   // Make our point estimates.
232   vector<Estimate> result;
233   for (unsigned d = 0; d < num_dimensions + compute_geometric_mean; ++d) {
234     bool is_geometric_mean = (d == num_dimensions);
235 
236     double* estimates = &all_estimates[d * num_resamples];
237 
238     // FindCDF() and others expect sorted data.
239     sort(estimates, estimates + num_resamples);
240 
241     // Make our point estimate.
242     double point_estimate = is_geometric_mean
243                                 ? EstimateGeometricMeanExcept(data, -1)
244                                 : EstimateRatioExcept(data[d], -1);
245 
246     // Compute bias correction, Eq. (2.9).
247     double z0 =
248         InverseNormalCDF(FindCDF(estimates, num_resamples, point_estimate));
249 
250     // Compute acceleration. This is Eq. (3.11), except that there seems
251     // to be a typo; the sign seems to be flipped compared to bcajack,
252     // so we correct that (see line 148 of bcajack.R). Note that bcajack
253     // uses the average-of-averages instead of the point estimate,
254     // which doesn't seem to match the paper, but the difference is
255     // completely insigificant in practice.
256     double sum_d_squared = 0.0;
257     double sum_d_cubed = 0.0;
258     if (is_geometric_mean) {
259       // NOTE: If there are differing numbers of samples in the different
260       // data series, this will be ever so slightly off, but the effect
261       // should hopefully be small.
262       for (unsigned i = 0; i < num_observations; ++i) {
263         double dd = point_estimate - EstimateGeometricMeanExcept(data, i);
264         sum_d_squared += dd * dd;
265         sum_d_cubed += dd * dd * dd;
266       }
267     } else {
268       for (unsigned i = 0; i < data[d].size(); ++i) {
269         double dd = point_estimate - EstimateRatioExcept(data[d], i);
270         sum_d_squared += dd * dd;
271         sum_d_cubed += dd * dd * dd;
272       }
273     }
274     double a = sum_d_cubed /
275                (6.0 * sqrt(sum_d_squared * sum_d_squared * sum_d_squared));
276 
277     double alpha = 0.5 * (1 - confidence_level);
278 
279     Estimate est;
280     est.point_estimate = point_estimate;
281     est.lower = ComputeBCa(estimates, num_resamples, alpha, z0, a);
282     est.upper = ComputeBCa(estimates, num_resamples, 1.0 - alpha, z0, a);
283     result.push_back(est);
284   }
285   return result;
286 }
287