• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 package org.apache.commons.math3.stat.inference;
19 
20 import java.math.BigDecimal;
21 import java.util.Arrays;
22 import java.util.HashSet;
23 
24 import org.apache.commons.math3.distribution.EnumeratedRealDistribution;
25 import org.apache.commons.math3.distribution.RealDistribution;
26 import org.apache.commons.math3.distribution.UniformRealDistribution;
27 import org.apache.commons.math3.exception.InsufficientDataException;
28 import org.apache.commons.math3.exception.MathArithmeticException;
29 import org.apache.commons.math3.exception.MathInternalError;
30 import org.apache.commons.math3.exception.NullArgumentException;
31 import org.apache.commons.math3.exception.NumberIsTooLargeException;
32 import org.apache.commons.math3.exception.OutOfRangeException;
33 import org.apache.commons.math3.exception.TooManyIterationsException;
34 import org.apache.commons.math3.exception.util.LocalizedFormats;
35 import org.apache.commons.math3.fraction.BigFraction;
36 import org.apache.commons.math3.fraction.BigFractionField;
37 import org.apache.commons.math3.fraction.FractionConversionException;
38 import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
39 import org.apache.commons.math3.linear.FieldMatrix;
40 import org.apache.commons.math3.linear.MatrixUtils;
41 import org.apache.commons.math3.linear.RealMatrix;
42 import org.apache.commons.math3.random.JDKRandomGenerator;
43 import org.apache.commons.math3.random.RandomGenerator;
44 import org.apache.commons.math3.random.Well19937c;
45 import org.apache.commons.math3.util.CombinatoricsUtils;
46 import org.apache.commons.math3.util.FastMath;
47 import org.apache.commons.math3.util.MathArrays;
48 import org.apache.commons.math3.util.MathUtils;
49 
50 /**
51  * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
52  * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
53  * <p>
54  * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
55  * sample data points from the distribution expected under the null hypothesis. For one-sample tests
56  * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
57  * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
58  * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
59  * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
60  * given in [2].
61  * </p>
62  * <p>
63  * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
64  * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
65  * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
66  * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
67  * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
68  * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
69  * follows:
70  * <ul>
71  * <li>For small samples (where the product of the sample sizes is less than
72  * {@value #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the
73  * exact p-value for the 2-sample test.</li>
74  * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
75  * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
76  * the approximation.</li>
77  * </ul></p><p>
78  * If the product of the sample sizes is less than {@value #LARGE_SAMPLE_PRODUCT} and the sample
79  * data contains ties, random jitter is added to the sample data to break ties before applying
80  * the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)}
81  * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
82  * in the R Matching package [3], can be used if ties are known to be present in the data.
83  * </p>
84  * <p>
85  * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
86  * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \)
87  * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
88  * {@code strict} parameter. This parameter is ignored for large samples.
89  * </p>
90  * <p>
91  * The methods used by the 2-sample default implementation are also exposed directly:
92  * <ul>
93  * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution. The {@code boolean}
95  * arguments in the first two methods allow the probability used to estimate the p-value to be
96  * expressed using strict or non-strict inequality. See
97  * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
98  * </ul>
99  * </p>
100  * <p>
101  * References:
102  * <ul>
103  * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
104  * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
105  * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
106  * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
107  * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
108  * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
109  * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
110  * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
111  * Chapter 5, 3rd Ed. Academic Press.</li>
112  * </ul>
113  * <br/>
114  * Note that [1] contains an error in computing h, refer to <a
115  * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
116  * </p>
117  *
118  * @since 3.3
119  */
120 public class KolmogorovSmirnovTest {
121 
122     /**
123      * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
124      */
125     protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
126 
127     /** Convergence criterion for {@link #ksSum(double, double, int)} */
128     protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;
129 
130     /** Convergence criterion for the sums in #pelzGood(double, double, int)} */
131     protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;
132 
133     /** No longer used. */
134     @Deprecated
135     protected static final int SMALL_SAMPLE_PRODUCT = 200;
136 
137     /**
138      * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
139      * distribution to compute the p-value.
140      */
141     protected static final int LARGE_SAMPLE_PRODUCT = 10000;
142 
143     /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)}.
144      *  Deprecated as of version 3.6, as this method is no longer needed. */
145     @Deprecated
146     protected static final int MONTE_CARLO_ITERATIONS = 1000000;
147 
148     /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
149     private final RandomGenerator rng;
150 
151     /**
152      * Construct a KolmogorovSmirnovTest instance with a default random data generator.
153      */
KolmogorovSmirnovTest()154     public KolmogorovSmirnovTest() {
155         rng = new Well19937c();
156     }
157 
158     /**
159      * Construct a KolmogorovSmirnovTest with the provided random data generator.
160      * The #monteCarloP(double, int, int, boolean, int) that uses the generator supplied to this
161      * constructor is deprecated as of version 3.6.
162      *
163      * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
164      */
165     @Deprecated
KolmogorovSmirnovTest(RandomGenerator rng)166     public KolmogorovSmirnovTest(RandomGenerator rng) {
167         this.rng = rng;
168     }
169 
170     /**
171      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
172      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
173      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
174      * {@code exact} is true, the distribution used to compute the p-value is computed using
175      * extended precision. See {@link #cdfExact(double, int)}.
176      *
177      * @param distribution reference distribution
178      * @param data sample being being evaluated
179      * @param exact whether or not to force exact computation of the p-value
180      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
181      *         {@code distribution}
182      * @throws InsufficientDataException if {@code data} does not have length at least 2
183      * @throws NullArgumentException if {@code data} is null
184      */
kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact)185     public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
186         return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
187     }
188 
189     /**
190      * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
191      * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
192      * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
193      * each of the values in {@code data}.
194      *
195      * @param distribution reference distribution
196      * @param data sample being evaluated
197      * @return Kolmogorov-Smirnov statistic \(D_n\)
198      * @throws InsufficientDataException if {@code data} does not have length at least 2
199      * @throws NullArgumentException if {@code data} is null
200      */
kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data)201     public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) {
202         checkArray(data);
203         final int n = data.length;
204         final double nd = n;
205         final double[] dataCopy = new double[n];
206         System.arraycopy(data, 0, dataCopy, 0, n);
207         Arrays.sort(dataCopy);
208         double d = 0d;
209         for (int i = 1; i <= n; i++) {
210             final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
211             final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi);
212             if (currD > d) {
213                 d = currD;
214             }
215         }
216         return d;
217     }
218 
219     /**
220      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
221      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
222      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
223      * probability distribution. Specifically, what is returned is an estimate of the probability
224      * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
225      * selected partition of the combined sample into subsamples of sizes {@code x.length} and
226      * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
227      * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
228      * <ul>
229      * <li>For small samples (where the product of the sample sizes is less than
230      * {@value #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented
231      * in [4], implemented in {@link #exactP(double, int, int, boolean)}. </li>
232      * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
233      * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
234      * for details on the approximation.</li>
235      * </ul><p>
236      * If {@code x.length * y.length} < {@value #LARGE_SAMPLE_PRODUCT} and the combined set of values in
237      * {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to
238      * break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed
239      * on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between
240      * values in the combined sample.</p>
241      * <p>
242      * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
243      * may be used as an alternative method for estimating the p-value.</p>
244      *
245      * @param x first sample dataset
246      * @param y second sample dataset
247      * @param strict whether or not the probability to compute is expressed as a strict inequality
248      *        (ignored for large samples)
249      * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
250      *         samples from the same distribution
251      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
252      *         least 2
253      * @throws NullArgumentException if either {@code x} or {@code y} is null
254      * @see #bootstrap(double[], double[], int, boolean)
255      */
kolmogorovSmirnovTest(double[] x, double[] y, boolean strict)256     public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
257         final long lengthProduct = (long) x.length * y.length;
258         double[] xa = null;
259         double[] ya = null;
260         if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
261             xa = MathArrays.copyOf(x);
262             ya = MathArrays.copyOf(y);
263             fixTies(xa, ya);
264         } else {
265             xa = x;
266             ya = y;
267         }
268         if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
269             return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
270         }
271         return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
272     }
273 
274     /**
275      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
276      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
277      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
278      * probability distribution. Assumes the strict form of the inequality used to compute the
279      * p-value. See {@link #kolmogorovSmirnovTest(RealDistribution, double[], boolean)}.
280      *
281      * @param x first sample dataset
282      * @param y second sample dataset
283      * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
284      *         samples from the same distribution
285      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
286      *         least 2
287      * @throws NullArgumentException if either {@code x} or {@code y} is null
288      */
kolmogorovSmirnovTest(double[] x, double[] y)289     public double kolmogorovSmirnovTest(double[] x, double[] y) {
290         return kolmogorovSmirnovTest(x, y, true);
291     }
292 
293     /**
294      * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
295      * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
296      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
297      * is the empirical distribution of the {@code y} values.
298      *
299      * @param x first sample
300      * @param y second sample
301      * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
302      *         {@code y} represent samples from the same underlying distribution
303      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
304      *         least 2
305      * @throws NullArgumentException if either {@code x} or {@code y} is null
306      */
kolmogorovSmirnovStatistic(double[] x, double[] y)307     public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
308         return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
309     }
310 
311     /**
312      * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
313      * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
314      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
315      * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
316      * as long value.
317      *
318      * @param x first sample
319      * @param y second sample
320      * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
321      *         {@code y} represent samples from the same underlying distribution
322      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
323      *         least 2
324      * @throws NullArgumentException if either {@code x} or {@code y} is null
325      */
integralKolmogorovSmirnovStatistic(double[] x, double[] y)326     private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
327         checkArray(x);
328         checkArray(y);
329         // Copy and sort the sample arrays
330         final double[] sx = MathArrays.copyOf(x);
331         final double[] sy = MathArrays.copyOf(y);
332         Arrays.sort(sx);
333         Arrays.sort(sy);
334         final int n = sx.length;
335         final int m = sy.length;
336 
337         int rankX = 0;
338         int rankY = 0;
339         long curD = 0l;
340 
341         // Find the max difference between cdf_x and cdf_y
342         long supD = 0l;
343         do {
344             double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
345             while(rankX < n && Double.compare(sx[rankX], z) == 0) {
346                 rankX += 1;
347                 curD += m;
348             }
349             while(rankY < m && Double.compare(sy[rankY], z) == 0) {
350                 rankY += 1;
351                 curD -= n;
352             }
353             if (curD > supD) {
354                 supD = curD;
355             }
356             else if (-curD > supD) {
357                 supD = -curD;
358             }
359         } while(rankX < n && rankY < m);
360         return supD;
361     }
362 
363     /**
364      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
365      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
366      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
367      *
368      * @param distribution reference distribution
369      * @param data sample being being evaluated
370      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
371      *         {@code distribution}
372      * @throws InsufficientDataException if {@code data} does not have length at least 2
373      * @throws NullArgumentException if {@code data} is null
374      */
kolmogorovSmirnovTest(RealDistribution distribution, double[] data)375     public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
376         return kolmogorovSmirnovTest(distribution, data, false);
377     }
378 
379     /**
380      * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
381      * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
382      *
383      * @param distribution reference distribution
384      * @param data sample being being evaluated
385      * @param alpha significance level of the test
386      * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
387      *         can be rejected with confidence 1 - {@code alpha}
388      * @throws InsufficientDataException if {@code data} does not have length at least 2
389      * @throws NullArgumentException if {@code data} is null
390      */
kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha)391     public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
392         if ((alpha <= 0) || (alpha > 0.5)) {
393             throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
394         }
395         return kolmogorovSmirnovTest(distribution, data) < alpha;
396     }
397 
398     /**
399      * Estimates the <i>p-value</i> of a two-sample
400      * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
401      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
402      * probability distribution. This method estimates the p-value by repeatedly sampling sets of size
403      * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample.
404      * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function
405      * {@code ks.boot}, described in <pre>
406      * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
407      * Software with Automated Balance Optimization: The Matching package for R.'
408      * Journal of Statistical Software, 42(7): 1-52.
409      * </pre>
410      * @param x first sample
411      * @param y second sample
412      * @param iterations number of bootstrap resampling iterations
413      * @param strict whether or not the null hypothesis is expressed as a strict inequality
414      * @return estimated p-value
415      */
bootstrap(double[] x, double[] y, int iterations, boolean strict)416     public double bootstrap(double[] x, double[] y, int iterations, boolean strict) {
417         final int xLength = x.length;
418         final int yLength = y.length;
419         final double[] combined = new double[xLength + yLength];
420         System.arraycopy(x, 0, combined, 0, xLength);
421         System.arraycopy(y, 0, combined, xLength, yLength);
422         final EnumeratedRealDistribution dist = new EnumeratedRealDistribution(rng, combined);
423         final long d = integralKolmogorovSmirnovStatistic(x, y);
424         int greaterCount = 0;
425         int equalCount = 0;
426         double[] curX;
427         double[] curY;
428         long curD;
429         for (int i = 0; i < iterations; i++) {
430             curX = dist.sample(xLength);
431             curY = dist.sample(yLength);
432             curD = integralKolmogorovSmirnovStatistic(curX, curY);
433             if (curD > d) {
434                 greaterCount++;
435             } else if (curD == d) {
436                 equalCount++;
437             }
438         }
439         return strict ? greaterCount / (double) iterations :
440             (greaterCount + equalCount) / (double) iterations;
441     }
442 
443     /**
444      * Computes {@code bootstrap(x, y, iterations, true)}.
445      * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching
446      * package function. See #bootstrap(double[], double[], int, boolean).
447      *
448      * @param x first sample
449      * @param y second sample
450      * @param iterations number of bootstrap resampling iterations
451      * @return estimated p-value
452      */
bootstrap(double[] x, double[] y, int iterations)453     public double bootstrap(double[] x, double[] y, int iterations) {
454         return bootstrap(x, y, iterations, true);
455     }
456 
457     /**
458      * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme
459      * values given in [2] (see above). The result is not exact as with
460      * {@link #cdfExact(double, int)} because calculations are based on
461      * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
462      *
463      * @param d statistic
464      * @param n sample size
465      * @return \(P(D_n < d)\)
466      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
467      *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
468      *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
469      */
cdf(double d, int n)470     public double cdf(double d, int n)
471         throws MathArithmeticException {
472         return cdf(d, n, false);
473     }
474 
475     /**
476      * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
477      * used everywhere at the expense of very slow execution time. Almost never choose this in real
478      * applications unless you are very sure; this is almost solely for verification purposes.
479      * Normally, you would choose {@link #cdf(double, int)}. See the class
480      * javadoc for definitions and algorithm description.
481      *
482      * @param d statistic
483      * @param n sample size
484      * @return \(P(D_n < d)\)
485      * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
486      *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
487      *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
488      */
cdfExact(double d, int n)489     public double cdfExact(double d, int n)
490         throws MathArithmeticException {
491         return cdf(d, n, true);
492     }
493 
494     /**
495      * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
496      * values given in [2] (see above).
497      *
498      * @param d statistic
499      * @param n sample size
500      * @param exact whether the probability should be calculated exact using
501      *        {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of
502      *        very slow execution time, or if {@code double} should be used convenient places to
503      *        gain speed. Almost never choose {@code true} in real applications unless you are very
504      *        sure; {@code true} is almost solely for verification purposes.
505      * @return \(P(D_n < d)\)
506      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
507      *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
508      *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
509      */
cdf(double d, int n, boolean exact)510     public double cdf(double d, int n, boolean exact)
511         throws MathArithmeticException {
512 
513         final double ninv = 1 / ((double) n);
514         final double ninvhalf = 0.5 * ninv;
515 
516         if (d <= ninvhalf) {
517             return 0;
518         } else if (ninvhalf < d && d <= ninv) {
519             double res = 1;
520             final double f = 2 * d - ninv;
521             // n! f^n = n*f * (n-1)*f * ... * 1*x
522             for (int i = 1; i <= n; ++i) {
523                 res *= i * f;
524             }
525             return res;
526         } else if (1 - ninv <= d && d < 1) {
527             return 1 - 2 * Math.pow(1 - d, n);
528         } else if (1 <= d) {
529             return 1;
530         }
531         if (exact) {
532             return exactK(d, n);
533         }
534         if (n <= 140) {
535             return roundedK(d, n);
536         }
537         return pelzGood(d, n);
538     }
539 
540     /**
541      * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
542      * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
543      * above).
544      *
545      * @param d statistic
546      * @param n sample size
547      * @return the two-sided probability of \(P(D_n < d)\)
548      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
549      *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
550      *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
551      */
exactK(double d, int n)552     private double exactK(double d, int n)
553         throws MathArithmeticException {
554 
555         final int k = (int) Math.ceil(n * d);
556 
557         final FieldMatrix<BigFraction> H = this.createExactH(d, n);
558         final FieldMatrix<BigFraction> Hpower = H.power(n);
559 
560         BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
561 
562         for (int i = 1; i <= n; ++i) {
563             pFrac = pFrac.multiply(i).divide(n);
564         }
565 
566         /*
567          * BigFraction.doubleValue converts numerator to double and the denominator to double and
568          * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
569          * digits):
570          */
571         return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
572     }
573 
574     /**
575      * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
576      *
577      * @param d statistic
578      * @param n sample size
579      * @return \(P(D_n < d)\)
580      */
roundedK(double d, int n)581     private double roundedK(double d, int n) {
582 
583         final int k = (int) Math.ceil(n * d);
584         final RealMatrix H = this.createRoundedH(d, n);
585         final RealMatrix Hpower = H.power(n);
586 
587         double pFrac = Hpower.getEntry(k - 1, k - 1);
588         for (int i = 1; i <= n; ++i) {
589             pFrac *= (double) i / (double) n;
590         }
591 
592         return pFrac;
593     }
594 
    /**
     * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
     * The approximation is an asymptotic expansion \(K_0(z) + K_1(z)/\sqrt{n} + K_2(z)/n + K_3(z)/(n\sqrt{n})\)
     * evaluated at \(z = d\sqrt{n}\); each \(K_i\) is an infinite series truncated when the
     * relative contribution of a term drops below {@code PG_SUM_RELATIVE_ERROR}.
     *
     * @param d value of d-statistic (x in [2])
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @throws TooManyIterationsException if any of the partial sums fails to converge
     *         within {@code MAXIMUM_PARTIAL_SUM_COUNT} terms
     * @since 3.4
     */
    public double pelzGood(double d, int n) {
        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
        final double sqrtN = FastMath.sqrt(n);
        final double z = d * sqrtN;
        final double z2 = d * d * n;   // z^2
        final double z4 = z2 * z2;     // z^4
        final double z6 = z4 * z2;     // z^6
        final double z8 = z4 * z4;     // z^8

        // Eventual return value
        double ret = 0;

        // Compute K_0(z): sum over odd kTerm = 2k - 1 of exp(-pi^2 (2k-1)^2 / (8 z^2))
        double sum = 0;
        double increment = 0;
        double kTerm = 0;
        double z2Term = MathUtils.PI_SQUARED / (8 * z2);
        int k = 1;
        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = 2 * k - 1;
            increment = FastMath.exp(-z2Term * kTerm * kTerm);
            sum += increment;
            // Terms are all positive; stop once the relative contribution is negligible.
            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;

        // K_1(z)
        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
        final double twoZ2 = 2 * z2;
        sum = 0;
        kTerm = 0;
        double kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            // Terms may change sign, so compare absolute values.
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
        // Instead of doubling sum, divide by 3 instead of 6
        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

        // K_2(z)
        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
        final double z4Term = 2 * z4;
        final double z6Term = 6 * z6;
        z2Term = 5 * z2;
        final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;  // pi^4
        sum = 0;
        kTerm = 0;
        kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment =  (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Second doubly-infinite sum of K_2; runs over integer k >= 1 (k = 0 term vanishes).
        double sum2 = 0;
        kTerm2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Again, adjust coefficients instead of doubling sum, sum2
        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
        // Multiply coefficient denominators by 2, so omit doubling sums.
        final double pi6 = pi4 * MathUtils.PI_SQUARED;  // pi^6
        sum = 0;
        double kTerm4 = 0;
        double kTerm6 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            kTerm4 = kTerm2 * kTerm2;
            kTerm6 = kTerm4 * kTerm2;
            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
                            MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Second doubly-infinite sum of K_3; again integer k >= 1.
        sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            kTerm4 = kTerm2 * kTerm2;
            increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // NOTE(review): the second '+' below is a redundant unary plus (harmless, no effect).
        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
                + sum2 / (108 * z6));

    }
735 
736     /***
737      * Creates {@code H} of size {@code m x m} as described in [1] (see above).
738      *
739      * @param d statistic
740      * @param n sample size
741      * @return H matrix
742      * @throws NumberIsTooLargeException if fractional part is greater than 1
743      * @throws FractionConversionException if algorithm fails to convert {@code h} to a
744      *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
745      *         - h) / m\) for integer {@code k, m} and \(0 <= h < 1\).
746      */
createExactH(double d, int n)747     private FieldMatrix<BigFraction> createExactH(double d, int n)
748         throws NumberIsTooLargeException, FractionConversionException {
749 
750         final int k = (int) Math.ceil(n * d);
751         final int m = 2 * k - 1;
752         final double hDouble = k - n * d;
753         if (hDouble >= 1) {
754             throw new NumberIsTooLargeException(hDouble, 1.0, false);
755         }
756         BigFraction h = null;
757         try {
758             h = new BigFraction(hDouble, 1.0e-20, 10000);
759         } catch (final FractionConversionException e1) {
760             try {
761                 h = new BigFraction(hDouble, 1.0e-10, 10000);
762             } catch (final FractionConversionException e2) {
763                 h = new BigFraction(hDouble, 1.0e-5, 10000);
764             }
765         }
766         final BigFraction[][] Hdata = new BigFraction[m][m];
767 
768         /*
769          * Start by filling everything with either 0 or 1.
770          */
771         for (int i = 0; i < m; ++i) {
772             for (int j = 0; j < m; ++j) {
773                 if (i - j + 1 < 0) {
774                     Hdata[i][j] = BigFraction.ZERO;
775                 } else {
776                     Hdata[i][j] = BigFraction.ONE;
777                 }
778             }
779         }
780 
781         /*
782          * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
783          * hPowers[m-1] = h^m
784          */
785         final BigFraction[] hPowers = new BigFraction[m];
786         hPowers[0] = h;
787         for (int i = 1; i < m; ++i) {
788             hPowers[i] = h.multiply(hPowers[i - 1]);
789         }
790 
791         /*
792          * First column and last row has special values (each other reversed).
793          */
794         for (int i = 0; i < m; ++i) {
795             Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
796             Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
797         }
798 
799         /*
800          * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
801          * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
802          */
803         if (h.compareTo(BigFraction.ONE_HALF) == 1) {
804             Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
805         }
806 
807         /*
808          * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
809          * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
810          * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
811          * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
812          * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
813          * really necessary.
814          */
815         for (int i = 0; i < m; ++i) {
816             for (int j = 0; j < i + 1; ++j) {
817                 if (i - j + 1 > 0) {
818                     for (int g = 2; g <= i - j + 1; ++g) {
819                         Hdata[i][j] = Hdata[i][j].divide(g);
820                     }
821                 }
822             }
823         }
824         return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
825     }
826 
827     /***
828      * Creates {@code H} of size {@code m x m} as described in [1] (see above)
829      * using double-precision.
830      *
831      * @param d statistic
832      * @param n sample size
833      * @return H matrix
834      * @throws NumberIsTooLargeException if fractional part is greater than 1
835      */
createRoundedH(double d, int n)836     private RealMatrix createRoundedH(double d, int n)
837         throws NumberIsTooLargeException {
838 
839         final int k = (int) Math.ceil(n * d);
840         final int m = 2 * k - 1;
841         final double h = k - n * d;
842         if (h >= 1) {
843             throw new NumberIsTooLargeException(h, 1.0, false);
844         }
845         final double[][] Hdata = new double[m][m];
846 
847         /*
848          * Start by filling everything with either 0 or 1.
849          */
850         for (int i = 0; i < m; ++i) {
851             for (int j = 0; j < m; ++j) {
852                 if (i - j + 1 < 0) {
853                     Hdata[i][j] = 0;
854                 } else {
855                     Hdata[i][j] = 1;
856                 }
857             }
858         }
859 
860         /*
861          * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
862          * hPowers[m-1] = h^m
863          */
864         final double[] hPowers = new double[m];
865         hPowers[0] = h;
866         for (int i = 1; i < m; ++i) {
867             hPowers[i] = h * hPowers[i - 1];
868         }
869 
870         /*
871          * First column and last row has special values (each other reversed).
872          */
873         for (int i = 0; i < m; ++i) {
874             Hdata[i][0] = Hdata[i][0] - hPowers[i];
875             Hdata[m - 1][i] -= hPowers[m - i - 1];
876         }
877 
878         /*
879          * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
880          * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
881          */
882         if (Double.compare(h, 0.5) > 0) {
883             Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m);
884         }
885 
886         /*
887          * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
888          * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
889          * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
890          * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
891          * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
892          * really necessary.
893          */
894         for (int i = 0; i < m; ++i) {
895             for (int j = 0; j < i + 1; ++j) {
896                 if (i - j + 1 > 0) {
897                     for (int g = 2; g <= i - j + 1; ++g) {
898                         Hdata[i][j] /= g;
899                     }
900                 }
901             }
902         }
903         return MatrixUtils.createRealMatrix(Hdata);
904     }
905 
906     /**
907      * Verifies that {@code array} has length at least 2.
908      *
909      * @param array array to test
910      * @throws NullArgumentException if array is null
911      * @throws InsufficientDataException if array is too short
912      */
checkArray(double[] array)913     private void checkArray(double[] array) {
914         if (array == null) {
915             throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
916         }
917         if (array.length < 2) {
918             throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
919                                                 2);
920         }
921     }
922 
923     /**
924      * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
925      * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
926      * have been computed. If the sum does not converge before {@code maxIterations} iterations a
927      * {@link TooManyIterationsException} is thrown.
928      *
929      * @param t argument
930      * @param tolerance Cauchy criterion for partial sums
931      * @param maxIterations maximum number of partial sums to compute
932      * @return Kolmogorov sum evaluated at t
933      * @throws TooManyIterationsException if the series does not converge
934      */
ksSum(double t, double tolerance, int maxIterations)935     public double ksSum(double t, double tolerance, int maxIterations) {
936         if (t == 0.0) {
937             return 0.0;
938         }
939 
940         // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
941         // from class javadoc should be used.
942 
943         final double x = -2 * t * t;
944         int sign = -1;
945         long i = 1;
946         double partialSum = 0.5d;
947         double delta = 1;
948         while (delta > tolerance && i < maxIterations) {
949             delta = FastMath.exp(x * i * i);
950             partialSum += sign * delta;
951             sign *= -1;
952             i++;
953         }
954         if (i == maxIterations) {
955             throw new TooManyIterationsException(maxIterations);
956         }
957         return partialSum * 2;
958     }
959 
960     /**
961      * Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
962      * an integral d-statistic in the range [0, n*m] is calculated, that can be used for
963      * comparison with other integral d-statistics. Depending whether {@code strict} is
964      * {@code true} or not, the returned value divided by (n*m) is greater than
965      * (resp greater than or equal to) the given d value (allowing some tolerance).
966      *
967      * @param d a d-statistic in the range [0, 1]
968      * @param n first sample size
969      * @param m second sample size
970      * @param strict whether the returned value divided by (n*m) is allowed to be equal to d
971      * @return the integral d-statistic in the range [0, n*m]
972      */
calculateIntegralD(double d, int n, int m, boolean strict)973     private static long calculateIntegralD(double d, int n, int m, boolean strict) {
974         final double tol = 1e-12;  // d-values within tol of one another are considered equal
975         long nm = n * (long)m;
976         long upperBound = (long)FastMath.ceil((d - tol) * nm);
977         long lowerBound = (long)FastMath.floor((d + tol) * nm);
978         if (strict && lowerBound == upperBound) {
979             return upperBound + 1l;
980         }
981         else {
982             return upperBound;
983         }
984     }
985 
986     /**
987      * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
988      * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
989      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
990      * <p>
991      * The returned probability is exact, implemented by unwinding the recursive function
992      * definitions presented in [4] (class javadoc).
993      * </p>
994      *
995      * @param d D-statistic value
996      * @param n first sample size
997      * @param m second sample size
998      * @param strict whether or not the probability to compute is expressed as a strict inequality
999      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1000      *         greater than (resp. greater than or equal to) {@code d}
1001      */
exactP(double d, int n, int m, boolean strict)1002     public double exactP(double d, int n, int m, boolean strict) {
1003        return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) /
1004                CombinatoricsUtils.binomialCoefficientDouble(n + m, m);
1005     }
1006 
1007     /**
1008      * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
1009      * is the 2-sample Kolmogorov-Smirnov statistic. See
1010      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
1011      * <p>
1012      * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
1013      * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
1014      * details on how convergence of the sum is determined. This implementation passes {@code ksSum}
1015      * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
1016      * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
1017      * </p>
1018      *
1019      * @param d D-statistic value
1020      * @param n first sample size
1021      * @param m second sample size
1022      * @return approximate probability that a randomly selected m-n partition of m + n generates
1023      *         \(D_{n,m}\) greater than {@code d}
1024      */
approximateP(double d, int n, int m)1025     public double approximateP(double d, int n, int m) {
1026         final double dm = m;
1027         final double dn = n;
1028         return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)),
1029                          KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
1030     }
1031 
1032     /**
1033      * Fills a boolean array randomly with a fixed number of {@code true} values.
1034      * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
1035      * By processing first the {@code true} values followed by the remaining {@code false} values
1036      * less random numbers need to be generated. The method is optimized for the case
1037      * that the number of {@code true} values is larger than or equal to the number of
1038      * {@code false} values.
1039      *
1040      * @param b boolean array
1041      * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
1042      * @param rng random data generator
1043      */
fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng)1044     static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) {
1045         Arrays.fill(b, true);
1046         for (int k = numberOfTrueValues; k < b.length; k++) {
1047             final int r = rng.nextInt(k + 1);
1048             b[(b[r]) ? r : k] = false;
1049         }
1050     }
1051 
1052     /**
1053      * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the
1054      * 2-sample Kolmogorov-Smirnov statistic. See
1055      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
1056      * <p>
1057      * The simulation generates {@code iterations} random partitions of {@code m + n} into an
1058      * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
1059      * the proportion of values that are greater than {@code d}, or greater than or equal to
1060      * {@code d} if {@code strict} is {@code false}.
1061      * </p>
1062      *
1063      * @param d D-statistic value
1064      * @param n first sample size
1065      * @param m second sample size
1066      * @param iterations number of random partitions to generate
1067      * @param strict whether or not the probability to compute is expressed as a strict inequality
1068      * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1069      *         greater than (resp. greater than or equal to) {@code d}
1070      */
monteCarloP(final double d, final int n, final int m, final boolean strict, final int iterations)1071     public double monteCarloP(final double d, final int n, final int m, final boolean strict,
1072                               final int iterations) {
1073         return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations);
1074     }
1075 
1076     /**
1077      * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the
1078      * 2-sample Kolmogorov-Smirnov statistic.
1079      * <p>
1080      * Here d is the D-statistic represented as long value.
1081      * The real D-statistic is obtained by dividing d by n*m.
1082      * See also {@link #monteCarloP(double, int, int, boolean, int)}.
1083      *
1084      * @param d integral D-statistic
1085      * @param n first sample size
1086      * @param m second sample size
1087      * @param iterations number of random partitions to generate
1088      * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1089      *         greater than or equal to {@code d/(n*m))}
1090      */
integralMonteCarloP(final long d, final int n, final int m, final int iterations)1091     private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) {
1092 
1093         // ensure that nn is always the max of (n, m) to require fewer random numbers
1094         final int nn = FastMath.max(n, m);
1095         final int mm = FastMath.min(n, m);
1096         final int sum = nn + mm;
1097 
1098         int tail = 0;
1099         final boolean b[] = new boolean[sum];
1100         for (int i = 0; i < iterations; i++) {
1101             fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
1102             long curD = 0l;
1103             for(int j = 0; j < b.length; ++j) {
1104                 if (b[j]) {
1105                     curD += mm;
1106                     if (curD >= d) {
1107                         tail++;
1108                         break;
1109                     }
1110                 } else {
1111                     curD -= nn;
1112                     if (curD <= -d) {
1113                         tail++;
1114                         break;
1115                     }
1116                 }
1117             }
1118         }
1119         return (double) tail / iterations;
1120     }
1121 
1122     /**
1123      * If there are no ties in the combined dataset formed from x and y, this
1124      * method is a no-op.  If there are ties, a uniform random deviate in
1125      * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
1126      * minDelta is the minimum difference between unequal values in the combined
1127      * sample.  A fixed seed is used to generate the jitter, so repeated activations
1128      * with the same input arrays result in the same values.
1129      *
1130      * NOTE: if there are ties in the data, this method overwrites the data in
1131      * x and y with the jittered values.
1132      *
1133      * @param x first sample
1134      * @param y second sample
1135      */
fixTies(double[] x, double[] y)1136     private static void fixTies(double[] x, double[] y) {
1137        final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
1138        if (values.length == x.length + y.length) {
1139            return;  // There are no ties
1140        }
1141 
1142        // Find the smallest difference between values, or 1 if all values are the same
1143        double minDelta = 1;
1144        double prev = values[0];
1145        double delta = 1;
1146        for (int i = 1; i < values.length; i++) {
1147           delta = prev - values[i];
1148           if (delta < minDelta) {
1149               minDelta = delta;
1150           }
1151           prev = values[i];
1152        }
1153        minDelta /= 2;
1154 
1155        // Add jitter using a fixed seed (so same arguments always give same results),
1156        // low-initialization-overhead generator
1157        final RealDistribution dist =
1158                new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta);
1159 
1160        // It is theoretically possible that jitter does not break ties, so repeat
1161        // until all ties are gone.  Bound the loop and throw MIE if bound is exceeded.
1162        int ct = 0;
1163        boolean ties = true;
1164        do {
1165            jitter(x, dist);
1166            jitter(y, dist);
1167            ties = hasTies(x, y);
1168            ct++;
1169        } while (ties && ct < 1000);
1170        if (ties) {
1171            throw new MathInternalError(); // Should never happen
1172        }
1173     }
1174 
1175     /**
1176      * Returns true iff there are ties in the combined sample
1177      * formed from x and y.
1178      *
1179      * @param x first sample
1180      * @param y second sample
1181      * @return true if x and y together contain ties
1182      */
hasTies(double[] x, double[] y)1183     private static boolean hasTies(double[] x, double[] y) {
1184         final HashSet<Double> values = new HashSet<Double>();
1185             for (int i = 0; i < x.length; i++) {
1186                 if (!values.add(x[i])) {
1187                     return true;
1188                 }
1189             }
1190             for (int i = 0; i < y.length; i++) {
1191                 if (!values.add(y[i])) {
1192                     return true;
1193                 }
1194             }
1195         return false;
1196     }
1197 
1198     /**
1199      * Adds random jitter to {@code data} using deviates sampled from {@code dist}.
1200      * <p>
1201      * Note that jitter is applied in-place - i.e., the array
1202      * values are overwritten with the result of applying jitter.</p>
1203      *
1204      * @param data input/output data array - entries overwritten by the method
1205      * @param dist probability distribution to sample for jitter values
1206      * @throws NullPointerException if either of the parameters is null
1207      */
jitter(double[] data, RealDistribution dist)1208     private static void jitter(double[] data, RealDistribution dist) {
1209         for (int i = 0; i < data.length; i++) {
1210             data[i] += dist.sample();
1211         }
1212     }
1213 
1214     /**
1215      * The function C(i, j) defined in [4] (class javadoc), formula (5.5).
1216      * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
1217      * and recoded as a long to avoid rounding errors in comparison tests, so what
1218      * is actually tested is |im - jn| <= cmn.
1219      *
1220      * @param i first path parameter
1221      * @param j second path paramter
1222      * @param m first sample size
1223      * @param n second sample size
1224      * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1225      * @param strict whether or not the null hypothesis uses strict inequality
1226      * @return C(i,j) for given m, n, c
1227      */
c(int i, int j, int m, int n, long cmn, boolean strict)1228     private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
1229         if (strict) {
1230             return FastMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
1231         }
1232         return FastMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
1233     }
1234 
1235     /**
1236      * The function N(i, j) defined in [4] (class javadoc).
1237      * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
1238      * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
1239      * The return value is integral, but subject to overflow, so it is maintained and
1240      * returned as a double.
1241      *
1242      * @param i first path parameter
1243      * @param j second path parameter
1244      * @param m first sample size
1245      * @param n second sample size
1246      * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1247      * @param strict whether or not the null hypothesis uses strict inequality
1248      * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
1249      */
n(int i, int j, int m, int n, long cnm, boolean strict)1250     private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
1251         /*
1252          * Unwind the recursive definition given in [4].
1253          * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
1254          * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
1255          */
1256         final double[] lag = new double[n];
1257         double last = 0;
1258         for (int k = 0; k < n; k++) {
1259             lag[k] = c(0, k + 1, m, n, cnm, strict);
1260         }
1261         for (int k = 1; k <= i; k++) {
1262             last = c(k, 0, m, n, cnm, strict);
1263             for (int l = 1; l <= j; l++) {
1264                 lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
1265                 last = lag[l - 1];
1266             }
1267         }
1268         return last;
1269     }
1270 }
1271