1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.math3.stat.inference; 19 20 import java.math.BigDecimal; 21 import java.util.Arrays; 22 import java.util.HashSet; 23 24 import org.apache.commons.math3.distribution.EnumeratedRealDistribution; 25 import org.apache.commons.math3.distribution.RealDistribution; 26 import org.apache.commons.math3.distribution.UniformRealDistribution; 27 import org.apache.commons.math3.exception.InsufficientDataException; 28 import org.apache.commons.math3.exception.MathArithmeticException; 29 import org.apache.commons.math3.exception.MathInternalError; 30 import org.apache.commons.math3.exception.NullArgumentException; 31 import org.apache.commons.math3.exception.NumberIsTooLargeException; 32 import org.apache.commons.math3.exception.OutOfRangeException; 33 import org.apache.commons.math3.exception.TooManyIterationsException; 34 import org.apache.commons.math3.exception.util.LocalizedFormats; 35 import org.apache.commons.math3.fraction.BigFraction; 36 import org.apache.commons.math3.fraction.BigFractionField; 37 import org.apache.commons.math3.fraction.FractionConversionException; 38 import org.apache.commons.math3.linear.Array2DRowFieldMatrix; 39 
import org.apache.commons.math3.linear.FieldMatrix; 40 import org.apache.commons.math3.linear.MatrixUtils; 41 import org.apache.commons.math3.linear.RealMatrix; 42 import org.apache.commons.math3.random.JDKRandomGenerator; 43 import org.apache.commons.math3.random.RandomGenerator; 44 import org.apache.commons.math3.random.Well19937c; 45 import org.apache.commons.math3.util.CombinatoricsUtils; 46 import org.apache.commons.math3.util.FastMath; 47 import org.apache.commons.math3.util.MathArrays; 48 import org.apache.commons.math3.util.MathUtils; 49 50 /** 51 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> 52 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions. 53 * <p> 54 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of 55 * sample data points from the distribution expected under the null hypothesis. For one-sample tests 56 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the 57 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and 58 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of 59 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values 60 * given in [2]. 61 * </p> 62 * <p> 63 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples 64 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test 65 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is 66 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of 67 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. 
The 68 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as 69 * follows: 70 * <ul> 71 * <li>For small samples (where the product of the sample sizes is less than 72 * {@value #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the 73 * exact p-value for the 2-sample test.</li> 74 * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic 75 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on 76 * the approximation.</li> 77 * </ul></p><p> 78 * If the product of the sample sizes is less than {@value #LARGE_SAMPLE_PRODUCT} and the sample 79 * data contains ties, random jitter is added to the sample data to break ties before applying 80 * the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)} 81 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a> 82 * in the R Matching package [3], can be used if ties are known to be present in the data. 83 * </p> 84 * <p> 85 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value 86 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \) 87 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean 88 * {@code strict} parameter. This parameter is ignored for large samples. 89 * </p> 90 * <p> 91 * The methods used by the 2-sample default implementation are also exposed directly: 92 * <ul> 93 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li> 94 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean} 95 * arguments in the first two methods allow the probability used to estimate the p-value to be 96 * expressed using strict or non-strict inequality. 
 * See
 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
 * </ul>
 * </p>
 * <p>
 * References:
 * <ul>
 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
 * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
 * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
 * Chapter 5, 3rd Ed. Academic Press.</li>
 * </ul>
 * <br/>
 * Note that [1] contains an error in computing h, refer to <a
 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
 * </p>
 *
 * @since 3.3
 */
public class KolmogorovSmirnovTest {

    /**
     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
     */
    protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;

    /** Convergence criterion for {@link #ksSum(double, double, int)} */
    protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;

    /** Convergence criterion for the sums in {@link #pelzGood(double, int)} */
    protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;

    /** No longer used. */
    @Deprecated
    protected static final int SMALL_SAMPLE_PRODUCT = 200;

    /**
     * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
     * distribution to compute the p-value.
     */
    protected static final int LARGE_SAMPLE_PRODUCT = 10000;

    /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)}.
     *  Deprecated as of version 3.6, as this method is no longer needed. */
    @Deprecated
    protected static final int MONTE_CARLO_ITERATIONS = 1000000;

    /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
    private final RandomGenerator rng;

    /**
     * Construct a KolmogorovSmirnovTest instance with a default random data generator.
     */
    public KolmogorovSmirnovTest() {
        rng = new Well19937c();
    }

    /**
     * Construct a KolmogorovSmirnovTest with the provided random data generator.
     * The {@link #monteCarloP(double, int, int, boolean, int)} method that uses the generator
     * supplied to this constructor is deprecated as of version 3.6.
     *
     * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
     */
    @Deprecated
    public KolmogorovSmirnovTest(RandomGenerator rng) {
        this.rng = rng;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
     * {@code exact} is true, the distribution used to compute the p-value is computed using
     * extended precision. See {@link #cdfExact(double, int)}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param exact whether or not to force exact computation of the p-value
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
        // p-value is the complement of P(D_n < d) evaluated at the observed statistic
        return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
    }

    /**
     * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
     * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
     * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
     * each of the values in {@code data}.
194 * 195 * @param distribution reference distribution 196 * @param data sample being evaluated 197 * @return Kolmogorov-Smirnov statistic \(D_n\) 198 * @throws InsufficientDataException if {@code data} does not have length at least 2 199 * @throws NullArgumentException if {@code data} is null 200 */ kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data)201 public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) { 202 checkArray(data); 203 final int n = data.length; 204 final double nd = n; 205 final double[] dataCopy = new double[n]; 206 System.arraycopy(data, 0, dataCopy, 0, n); 207 Arrays.sort(dataCopy); 208 double d = 0d; 209 for (int i = 1; i <= n; i++) { 210 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]); 211 final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi); 212 if (currD > d) { 213 d = currD; 214 } 215 } 216 return d; 217 } 218 219 /** 220 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 221 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 222 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 223 * probability distribution. Specifically, what is returned is an estimate of the probability 224 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly 225 * selected partition of the combined sample into subsamples of sizes {@code x.length} and 226 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as 227 * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}. 228 * <ul> 229 * <li>For small samples (where the product of the sample sizes is less than 230 * {@value #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented 231 * in [4], implemented in {@link #exactP(double, int, int, boolean)}. 
     * </li>
     * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
     * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
     * for details on the approximation.</li>
     * </ul><p>
     * If {@code x.length * y.length} &lt; {@value #LARGE_SAMPLE_PRODUCT} and the combined set of values in
     * {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to
     * break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed
     * on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between
     * values in the combined sample.</p>
     * <p>
     * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
     * may be used as an alternative method for estimating the p-value.</p>
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     *        (ignored for large samples)
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     * @see #bootstrap(double[], double[], int, boolean)
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
        final long lengthProduct = (long) x.length * y.length;
        double[] xa = null;
        double[] ya = null;
        // Ties make the exact (small-sample) p-value computation ill-defined, so jitter is
        // applied to copies of the data; the caller's arrays are never modified.
        if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
            xa = MathArrays.copyOf(x);
            ya = MathArrays.copyOf(y);
            fixTies(xa, ya);
        } else {
            xa = x;
            ya = y;
        }
        if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
            return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
        }
        // Large-sample path: ties were not fixed above, so xa/ya alias x/y here.
        return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. Assumes the strict form of the inequality used to compute the
     * p-value. See {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y) {
        return kolmogorovSmirnovTest(x, y, true);
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values.
298 * 299 * @param x first sample 300 * @param y second sample 301 * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and 302 * {@code y} represent samples from the same underlying distribution 303 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at 304 * least 2 305 * @throws NullArgumentException if either {@code x} or {@code y} is null 306 */ kolmogorovSmirnovStatistic(double[] x, double[] y)307 public double kolmogorovSmirnovStatistic(double[] x, double[] y) { 308 return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length)); 309 } 310 311 /** 312 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) 313 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the 314 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) 315 * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned 316 * as long value. 
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
        checkArray(x);
        checkArray(y);
        // Copy and sort the sample arrays
        final double[] sx = MathArrays.copyOf(x);
        final double[] sy = MathArrays.copyOf(y);
        Arrays.sort(sx);
        Arrays.sort(sy);
        final int n = sx.length;
        final int m = sy.length;

        // rankX/rankY count how many values of each sample are <= the current merge point.
        int rankX = 0;
        int rankY = 0;
        // curD is n*m*(F_n(z) - F_m(z)) at the current point z: each consumed x adds m and each
        // consumed y subtracts n, keeping the whole computation in exact integer arithmetic.
        long curD = 0l;

        // Find the max difference between cdf_x and cdf_y
        long supD = 0l;
        do {
            // Take the smaller of the two next values, then consume every element equal to it
            // from both samples so ties (within and across samples) are handled in one step.
            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
                rankX += 1;
                curD += m;
            }
            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
                rankY += 1;
                curD -= n;
            }
            // Track the supremum of |curD| (deviation can be in either direction).
            if (curD > supD) {
                supD = curD;
            }
            else if (-curD > supD) {
                supD = -curD;
            }
        } while(rankX < n && rankY < m);
        return supD;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
367 * 368 * @param distribution reference distribution 369 * @param data sample being being evaluated 370 * @return the p-value associated with the null hypothesis that {@code data} is a sample from 371 * {@code distribution} 372 * @throws InsufficientDataException if {@code data} does not have length at least 2 373 * @throws NullArgumentException if {@code data} is null 374 */ kolmogorovSmirnovTest(RealDistribution distribution, double[] data)375 public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) { 376 return kolmogorovSmirnovTest(distribution, data, false); 377 } 378 379 /** 380 * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov 381 * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}. 382 * 383 * @param distribution reference distribution 384 * @param data sample being being evaluated 385 * @param alpha significance level of the test 386 * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution} 387 * can be rejected with confidence 1 - {@code alpha} 388 * @throws InsufficientDataException if {@code data} does not have length at least 2 389 * @throws NullArgumentException if {@code data} is null 390 */ kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha)391 public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) { 392 if ((alpha <= 0) || (alpha > 0.5)) { 393 throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5); 394 } 395 return kolmogorovSmirnovTest(distribution, data) < alpha; 396 } 397 398 /** 399 * Estimates the <i>p-value</i> of a two-sample 400 * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 401 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 402 * probability distribution. 
     * This method estimates the p-value by repeatedly sampling sets of size
     * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample.
     * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function
     * {@code ks.boot}, described in <pre>
     * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
     * Software with Automated Balance Optimization: The Matching package for R.'
     * Journal of Statistical Software, 42(7): 1-52.
     * </pre>
     * @param x first sample
     * @param y second sample
     * @param iterations number of bootstrap resampling iterations
     * @param strict whether or not the null hypothesis is expressed as a strict inequality
     * @return estimated p-value
     */
    public double bootstrap(double[] x, double[] y, int iterations, boolean strict) {
        final int xLength = x.length;
        final int yLength = y.length;
        final double[] combined = new double[xLength + yLength];
        System.arraycopy(x, 0, combined, 0, xLength);
        System.arraycopy(y, 0, combined, xLength, yLength);
        // Resample from the empirical distribution of the pooled data (the distribution under
        // the null hypothesis), using this instance's random generator.
        final EnumeratedRealDistribution dist = new EnumeratedRealDistribution(rng, combined);
        // Use the integer-valued statistic n*m*D so bootstrap replicates can be compared with
        // the observed value exactly, without floating-point rounding.
        final long d = integralKolmogorovSmirnovStatistic(x, y);
        int greaterCount = 0;
        int equalCount = 0;
        double[] curX;
        double[] curY;
        long curD;
        for (int i = 0; i < iterations; i++) {
            curX = dist.sample(xLength);
            curY = dist.sample(yLength);
            curD = integralKolmogorovSmirnovStatistic(curX, curY);
            if (curD > d) {
                greaterCount++;
            } else if (curD == d) {
                equalCount++;
            }
        }
        // strict: estimate P(D* > d); non-strict: estimate P(D* >= d).
        return strict ? greaterCount / (double) iterations :
            (greaterCount + equalCount) / (double) iterations;
    }

    /**
     * Computes {@code bootstrap(x, y, iterations, true)}.
     * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching
     * package function.
     * See {@link #bootstrap(double[], double[], int, boolean)}.
     *
     * @param x first sample
     * @param y second sample
     * @param iterations number of bootstrap resampling iterations
     * @return estimated p-value
     */
    public double bootstrap(double[] x, double[] y, int iterations) {
        return bootstrap(x, y, iterations, true);
    }

    /**
     * Calculates \(P(D_n &lt; d)\) using the method described in [1] with quick decisions for extreme
     * values given in [2] (see above). The result is not exact as with
     * {@link #cdfExact(double, int)} because calculations are based on
     * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
     */
    public double cdf(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, false);
    }

    /**
     * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
     * used everywhere at the expense of very slow execution time. Almost never choose this in real
     * applications unless you are very sure; this is almost solely for verification purposes.
     * Normally, you would choose {@link #cdf(double, int)}. See the class
     * javadoc for definitions and algorithm description.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
     */
    public double cdfExact(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, true);
    }

    /**
     * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
     * values given in [2] (see above).
     *
     * @param d statistic
     * @param n sample size
     * @param exact whether the probability should be calculated exact using
     *        {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of
     *        very slow execution time, or if {@code double} should be used convenient places to
     *        gain speed. Almost never choose {@code true} in real applications unless you are very
     *        sure; {@code true} is almost solely for verification purposes.
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
     */
    public double cdf(double d, int n, boolean exact)
        throws MathArithmeticException {

        final double ninv = 1 / ((double) n);
        final double ninvhalf = 0.5 * ninv;

        // Quick decisions for extreme values of d, per [2].
        if (d <= ninvhalf) {
            // D_n is always at least 1/(2n), so the cdf below that is 0.
            return 0;
        } else if (ninvhalf < d && d <= ninv) {
            // Closed form on (1/(2n), 1/n]: n! (2d - 1/n)^n, accumulated one factor at a time
            // to avoid computing n! and f^n separately (overflow/underflow).
            double res = 1;
            final double f = 2 * d - ninv;
            // n! f^n = n*f * (n-1)*f * ... * 1*f
            for (int i = 1; i <= n; ++i) {
                res *= i * f;
            }
            return res;
        } else if (1 - ninv <= d && d < 1) {
            // Closed form on [1 - 1/n, 1): 1 - 2(1 - d)^n.
            return 1 - 2 * Math.pow(1 - d, n);
        } else if (1 <= d) {
            return 1;
        }
        if (exact) {
            return exactK(d, n);
        }
        if (n <= 140) {
            // The [1] matrix method is accurate enough in double precision for small n.
            return roundedK(d, n);
        }
        // For larger n, use the Pelz-Good asymptotic expansion from [2].
        return pelzGood(d, n);
    }

    /**
     * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
     * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
     * above).
     *
     * @param d statistic
     * @param n sample size
     * @return the two-sided probability of \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
     */
    private double exactK(double d, int n)
        throws MathArithmeticException {

        final int k = (int) Math.ceil(n * d);

        // Per [1]: P(D_n < d) = (n!/n^n) * (H^n)[k-1][k-1] with H the (2k-1)x(2k-1) matrix.
        final FieldMatrix<BigFraction> H = this.createExactH(d, n);
        final FieldMatrix<BigFraction> Hpower = H.power(n);

        BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);

        // Multiply by n!/n^n one factor (i/n) at a time to keep the fraction manageable.
        for (int i = 1; i <= n; ++i) {
            pFrac = pFrac.multiply(i).divide(n);
        }

        /*
         * BigFraction.doubleValue converts numerator to double and the denominator to double and
         * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
         * digits):
         */
        return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
    }

    /**
     * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     */
    private double roundedK(double d, int n) {

        final int k = (int) Math.ceil(n * d);
        final RealMatrix H = this.createRoundedH(d, n);
        final RealMatrix Hpower = H.power(n);

        double pFrac = Hpower.getEntry(k - 1, k - 1);
        // Multiply by n!/n^n one factor (i/n) at a time to limit overflow/underflow.
        for (int i = 1; i <= n; ++i) {
            pFrac *= (double) i / (double) n;
        }

        return pFrac;
    }

    /**
     * Computes the Pelz-Good approximation for \(P(D_n &lt; d)\) as described in [2] in the class javadoc.
     *
     * @param d value of d-statistic (x in [2])
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @throws TooManyIterationsException if any of the partial sums fails to converge within
     *         {@link #MAXIMUM_PARTIAL_SUM_COUNT} terms
     * @since 3.4
     */
    public double pelzGood(double d, int n) {
        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
        final double sqrtN = FastMath.sqrt(n);
        final double z = d * sqrtN;
        final double z2 = d * d * n;
        final double z4 = z2 * z2;
        final double z6 = z4 * z2;
        final double z8 = z4 * z4;

        // Eventual return value
        double ret = 0;

        // Compute K_0(z)
        double sum = 0;
        double increment = 0;
        double kTerm = 0;
        double z2Term = MathUtils.PI_SQUARED / (8 * z2);
        int k = 1;
        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = 2 * k - 1;
            increment = FastMath.exp(-z2Term * kTerm * kTerm);
            sum += increment;
            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;

        // K_1(z)
        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
        final double twoZ2 = 2 * z2;
        sum = 0;
        kTerm = 0;
        double kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
        // Instead of doubling sum, divide by 3 instead of 6
        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

        // K_2(z)
        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
        final double z4Term = 2 * z4;
        final double z6Term = 6 * z6;
        z2Term = 5 * z2;
        final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;
        sum = 0;
        kTerm = 0;
        kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        double sum2 = 0;
        kTerm2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Again, adjust coefficients instead of doubling sum, sum2
        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
        // Multiply coefficient denominators by 2, so omit doubling sums.
        final double pi6 = pi4 * MathUtils.PI_SQUARED;
        sum = 0;
        double kTerm4 = 0;
        double kTerm6 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            kTerm4 = kTerm2 * kTerm2;
            kTerm6 = kTerm4 * kTerm2;
            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
                    MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            kTerm4 = kTerm2 * kTerm2;
            increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
                + sum2 / (108 * z6));

    }

    /***
     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
738 * 739 * @param d statistic 740 * @param n sample size 741 * @return H matrix 742 * @throws NumberIsTooLargeException if fractional part is greater than 1 743 * @throws FractionConversionException if algorithm fails to convert {@code h} to a 744 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 745 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\). 746 */ createExactH(double d, int n)747 private FieldMatrix<BigFraction> createExactH(double d, int n) 748 throws NumberIsTooLargeException, FractionConversionException { 749 750 final int k = (int) Math.ceil(n * d); 751 final int m = 2 * k - 1; 752 final double hDouble = k - n * d; 753 if (hDouble >= 1) { 754 throw new NumberIsTooLargeException(hDouble, 1.0, false); 755 } 756 BigFraction h = null; 757 try { 758 h = new BigFraction(hDouble, 1.0e-20, 10000); 759 } catch (final FractionConversionException e1) { 760 try { 761 h = new BigFraction(hDouble, 1.0e-10, 10000); 762 } catch (final FractionConversionException e2) { 763 h = new BigFraction(hDouble, 1.0e-5, 10000); 764 } 765 } 766 final BigFraction[][] Hdata = new BigFraction[m][m]; 767 768 /* 769 * Start by filling everything with either 0 or 1. 770 */ 771 for (int i = 0; i < m; ++i) { 772 for (int j = 0; j < m; ++j) { 773 if (i - j + 1 < 0) { 774 Hdata[i][j] = BigFraction.ZERO; 775 } else { 776 Hdata[i][j] = BigFraction.ONE; 777 } 778 } 779 } 780 781 /* 782 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 783 * hPowers[m-1] = h^m 784 */ 785 final BigFraction[] hPowers = new BigFraction[m]; 786 hPowers[0] = h; 787 for (int i = 1; i < m; ++i) { 788 hPowers[i] = h.multiply(hPowers[i - 1]); 789 } 790 791 /* 792 * First column and last row has special values (each other reversed). 
793 */ 794 for (int i = 0; i < m; ++i) { 795 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); 796 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); 797 } 798 799 /* 800 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 801 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 802 */ 803 if (h.compareTo(BigFraction.ONE_HALF) == 1) { 804 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m)); 805 } 806 807 /* 808 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 809 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 810 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 811 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 812 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 813 * really necessary. 814 */ 815 for (int i = 0; i < m; ++i) { 816 for (int j = 0; j < i + 1; ++j) { 817 if (i - j + 1 > 0) { 818 for (int g = 2; g <= i - j + 1; ++g) { 819 Hdata[i][j] = Hdata[i][j].divide(g); 820 } 821 } 822 } 823 } 824 return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); 825 } 826 827 /*** 828 * Creates {@code H} of size {@code m x m} as described in [1] (see above) 829 * using double-precision. 
830 * 831 * @param d statistic 832 * @param n sample size 833 * @return H matrix 834 * @throws NumberIsTooLargeException if fractional part is greater than 1 835 */ createRoundedH(double d, int n)836 private RealMatrix createRoundedH(double d, int n) 837 throws NumberIsTooLargeException { 838 839 final int k = (int) Math.ceil(n * d); 840 final int m = 2 * k - 1; 841 final double h = k - n * d; 842 if (h >= 1) { 843 throw new NumberIsTooLargeException(h, 1.0, false); 844 } 845 final double[][] Hdata = new double[m][m]; 846 847 /* 848 * Start by filling everything with either 0 or 1. 849 */ 850 for (int i = 0; i < m; ++i) { 851 for (int j = 0; j < m; ++j) { 852 if (i - j + 1 < 0) { 853 Hdata[i][j] = 0; 854 } else { 855 Hdata[i][j] = 1; 856 } 857 } 858 } 859 860 /* 861 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 862 * hPowers[m-1] = h^m 863 */ 864 final double[] hPowers = new double[m]; 865 hPowers[0] = h; 866 for (int i = 1; i < m; ++i) { 867 hPowers[i] = h * hPowers[i - 1]; 868 } 869 870 /* 871 * First column and last row has special values (each other reversed). 872 */ 873 for (int i = 0; i < m; ++i) { 874 Hdata[i][0] = Hdata[i][0] - hPowers[i]; 875 Hdata[m - 1][i] -= hPowers[m - i - 1]; 876 } 877 878 /* 879 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 880 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 881 */ 882 if (Double.compare(h, 0.5) > 0) { 883 Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m); 884 } 885 886 /* 887 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 888 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 889 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 890 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 891 * j'ing all the way to m. 
Also note that it is started at g = 2 because dividing by 1 isn't 892 * really necessary. 893 */ 894 for (int i = 0; i < m; ++i) { 895 for (int j = 0; j < i + 1; ++j) { 896 if (i - j + 1 > 0) { 897 for (int g = 2; g <= i - j + 1; ++g) { 898 Hdata[i][j] /= g; 899 } 900 } 901 } 902 } 903 return MatrixUtils.createRealMatrix(Hdata); 904 } 905 906 /** 907 * Verifies that {@code array} has length at least 2. 908 * 909 * @param array array to test 910 * @throws NullArgumentException if array is null 911 * @throws InsufficientDataException if array is too short 912 */ checkArray(double[] array)913 private void checkArray(double[] array) { 914 if (array == null) { 915 throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED); 916 } 917 if (array.length < 2) { 918 throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length, 919 2); 920 } 921 } 922 923 /** 924 * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial 925 * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums 926 * have been computed. If the sum does not converge before {@code maxIterations} iterations a 927 * {@link TooManyIterationsException} is thrown. 928 * 929 * @param t argument 930 * @param tolerance Cauchy criterion for partial sums 931 * @param maxIterations maximum number of partial sums to compute 932 * @return Kolmogorov sum evaluated at t 933 * @throws TooManyIterationsException if the series does not converge 934 */ ksSum(double t, double tolerance, int maxIterations)935 public double ksSum(double t, double tolerance, int maxIterations) { 936 if (t == 0.0) { 937 return 0.0; 938 } 939 940 // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1] 941 // from class javadoc should be used. 
942 943 final double x = -2 * t * t; 944 int sign = -1; 945 long i = 1; 946 double partialSum = 0.5d; 947 double delta = 1; 948 while (delta > tolerance && i < maxIterations) { 949 delta = FastMath.exp(x * i * i); 950 partialSum += sign * delta; 951 sign *= -1; 952 i++; 953 } 954 if (i == maxIterations) { 955 throw new TooManyIterationsException(maxIterations); 956 } 957 return partialSum * 2; 958 } 959 960 /** 961 * Given a d-statistic in the range [0, 1] and the two sample sizes n and m, 962 * an integral d-statistic in the range [0, n*m] is calculated, that can be used for 963 * comparison with other integral d-statistics. Depending whether {@code strict} is 964 * {@code true} or not, the returned value divided by (n*m) is greater than 965 * (resp greater than or equal to) the given d value (allowing some tolerance). 966 * 967 * @param d a d-statistic in the range [0, 1] 968 * @param n first sample size 969 * @param m second sample size 970 * @param strict whether the returned value divided by (n*m) is allowed to be equal to d 971 * @return the integral d-statistic in the range [0, n*m] 972 */ calculateIntegralD(double d, int n, int m, boolean strict)973 private static long calculateIntegralD(double d, int n, int m, boolean strict) { 974 final double tol = 1e-12; // d-values within tol of one another are considered equal 975 long nm = n * (long)m; 976 long upperBound = (long)FastMath.ceil((d - tol) * nm); 977 long lowerBound = (long)FastMath.floor((d + tol) * nm); 978 if (strict && lowerBound == upperBound) { 979 return upperBound + 1l; 980 } 981 else { 982 return upperBound; 983 } 984 } 985 986 /** 987 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge 988 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See 989 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 
990 * <p> 991 * The returned probability is exact, implemented by unwinding the recursive function 992 * definitions presented in [4] (class javadoc). 993 * </p> 994 * 995 * @param d D-statistic value 996 * @param n first sample size 997 * @param m second sample size 998 * @param strict whether or not the probability to compute is expressed as a strict inequality 999 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) 1000 * greater than (resp. greater than or equal to) {@code d} 1001 */ exactP(double d, int n, int m, boolean strict)1002 public double exactP(double d, int n, int m, boolean strict) { 1003 return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) / 1004 CombinatoricsUtils.binomialCoefficientDouble(n + m, m); 1005 } 1006 1007 /** 1008 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) 1009 * is the 2-sample Kolmogorov-Smirnov statistic. See 1010 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 1011 * <p> 1012 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2 1013 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for 1014 * details on how convergence of the sum is determined. This implementation passes {@code ksSum} 1015 * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and 1016 * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}. 
1017 * </p> 1018 * 1019 * @param d D-statistic value 1020 * @param n first sample size 1021 * @param m second sample size 1022 * @return approximate probability that a randomly selected m-n partition of m + n generates 1023 * \(D_{n,m}\) greater than {@code d} 1024 */ approximateP(double d, int n, int m)1025 public double approximateP(double d, int n, int m) { 1026 final double dm = m; 1027 final double dn = n; 1028 return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), 1029 KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT); 1030 } 1031 1032 /** 1033 * Fills a boolean array randomly with a fixed number of {@code true} values. 1034 * The method uses a simplified version of the Fisher-Yates shuffle algorithm. 1035 * By processing first the {@code true} values followed by the remaining {@code false} values 1036 * less random numbers need to be generated. The method is optimized for the case 1037 * that the number of {@code true} values is larger than or equal to the number of 1038 * {@code false} values. 1039 * 1040 * @param b boolean array 1041 * @param numberOfTrueValues number of {@code true} values the boolean array should finally have 1042 * @param rng random data generator 1043 */ fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng)1044 static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) { 1045 Arrays.fill(b, true); 1046 for (int k = numberOfTrueValues; k < b.length; k++) { 1047 final int r = rng.nextInt(k + 1); 1048 b[(b[r]) ? r : k] = false; 1049 } 1050 } 1051 1052 /** 1053 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 1054 * 2-sample Kolmogorov-Smirnov statistic. See 1055 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 
1056 * <p> 1057 * The simulation generates {@code iterations} random partitions of {@code m + n} into an 1058 * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning 1059 * the proportion of values that are greater than {@code d}, or greater than or equal to 1060 * {@code d} if {@code strict} is {@code false}. 1061 * </p> 1062 * 1063 * @param d D-statistic value 1064 * @param n first sample size 1065 * @param m second sample size 1066 * @param iterations number of random partitions to generate 1067 * @param strict whether or not the probability to compute is expressed as a strict inequality 1068 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1069 * greater than (resp. greater than or equal to) {@code d} 1070 */ monteCarloP(final double d, final int n, final int m, final boolean strict, final int iterations)1071 public double monteCarloP(final double d, final int n, final int m, final boolean strict, 1072 final int iterations) { 1073 return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations); 1074 } 1075 1076 /** 1077 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the 1078 * 2-sample Kolmogorov-Smirnov statistic. 1079 * <p> 1080 * Here d is the D-statistic represented as long value. 1081 * The real D-statistic is obtained by dividing d by n*m. 1082 * See also {@link #monteCarloP(double, int, int, boolean, int)}. 
1083 * 1084 * @param d integral D-statistic 1085 * @param n first sample size 1086 * @param m second sample size 1087 * @param iterations number of random partitions to generate 1088 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1089 * greater than or equal to {@code d/(n*m))} 1090 */ integralMonteCarloP(final long d, final int n, final int m, final int iterations)1091 private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) { 1092 1093 // ensure that nn is always the max of (n, m) to require fewer random numbers 1094 final int nn = FastMath.max(n, m); 1095 final int mm = FastMath.min(n, m); 1096 final int sum = nn + mm; 1097 1098 int tail = 0; 1099 final boolean b[] = new boolean[sum]; 1100 for (int i = 0; i < iterations; i++) { 1101 fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng); 1102 long curD = 0l; 1103 for(int j = 0; j < b.length; ++j) { 1104 if (b[j]) { 1105 curD += mm; 1106 if (curD >= d) { 1107 tail++; 1108 break; 1109 } 1110 } else { 1111 curD -= nn; 1112 if (curD <= -d) { 1113 tail++; 1114 break; 1115 } 1116 } 1117 } 1118 } 1119 return (double) tail / iterations; 1120 } 1121 1122 /** 1123 * If there are no ties in the combined dataset formed from x and y, this 1124 * method is a no-op. If there are ties, a uniform random deviate in 1125 * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where 1126 * minDelta is the minimum difference between unequal values in the combined 1127 * sample. A fixed seed is used to generate the jitter, so repeated activations 1128 * with the same input arrays result in the same values. 1129 * 1130 * NOTE: if there are ties in the data, this method overwrites the data in 1131 * x and y with the jittered values. 
     *
     * @param x first sample
     * @param y second sample
     */
    private static void fixTies(double[] x, double[] y) {
        // unique() removes duplicates; equal lengths therefore means no ties.
        final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
        if (values.length == x.length + y.length) {
            return; // There are no ties
        }

        // Find the smallest difference between values, or 1 if all values are the same
        // NOTE(review): "prev - values[i]" yields a positive delta only if
        // MathArrays.unique returns values in descending order - presumably it does;
        // confirm against the MathArrays.unique contract.
        double minDelta = 1;
        double prev = values[0];
        double delta = 1;
        for (int i = 1; i < values.length; i++) {
            delta = prev - values[i];
            if (delta < minDelta) {
                minDelta = delta;
            }
            prev = values[i];
        }
        minDelta /= 2;

        // Add jitter using a fixed seed (so same arguments always give same results),
        // low-initialization-overhead generator
        final RealDistribution dist =
                new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta);

        // It is theoretically possible that jitter does not break ties, so repeat
        // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
        int ct = 0;
        boolean ties = true;
        do {
            jitter(x, dist);
            jitter(y, dist);
            ties = hasTies(x, y);
            ct++;
        } while (ties && ct < 1000);
        if (ties) {
            throw new MathInternalError(); // Should never happen
        }
    }

    /**
     * Returns true iff there are ties in the combined sample
     * formed from x and y.
1178 * 1179 * @param x first sample 1180 * @param y second sample 1181 * @return true if x and y together contain ties 1182 */ hasTies(double[] x, double[] y)1183 private static boolean hasTies(double[] x, double[] y) { 1184 final HashSet<Double> values = new HashSet<Double>(); 1185 for (int i = 0; i < x.length; i++) { 1186 if (!values.add(x[i])) { 1187 return true; 1188 } 1189 } 1190 for (int i = 0; i < y.length; i++) { 1191 if (!values.add(y[i])) { 1192 return true; 1193 } 1194 } 1195 return false; 1196 } 1197 1198 /** 1199 * Adds random jitter to {@code data} using deviates sampled from {@code dist}. 1200 * <p> 1201 * Note that jitter is applied in-place - i.e., the array 1202 * values are overwritten with the result of applying jitter.</p> 1203 * 1204 * @param data input/output data array - entries overwritten by the method 1205 * @param dist probability distribution to sample for jitter values 1206 * @throws NullPointerException if either of the parameters is null 1207 */ jitter(double[] data, RealDistribution dist)1208 private static void jitter(double[] data, RealDistribution dist) { 1209 for (int i = 0; i < data.length; i++) { 1210 data[i] += dist.sample(); 1211 } 1212 } 1213 1214 /** 1215 * The function C(i, j) defined in [4] (class javadoc), formula (5.5). 1216 * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up 1217 * and recoded as a long to avoid rounding errors in comparison tests, so what 1218 * is actually tested is |im - jn| <= cmn. 
     *
     * @param i first path parameter
     * @param j second path paramter
     * @param m first sample size
     * @param n second sample size
     * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return C(i,j) for given m, n, c
     */
    private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
        // long arithmetic prevents overflow of i*n and j*m for large sample sizes.
        // NOTE(review): the comparison for strict is non-strict (<=) and vice versa;
        // this inversion appears intentional and paired with calculateIntegralD's
        // "+1" adjustment for the strict case - confirm against [4].
        if (strict) {
            return FastMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
        }
        return FastMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
    }

    /**
     * The function N(i, j) defined in [4] (class javadoc).
     * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
     * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
     * The return value is integral, but subject to overflow, so it is maintained and
     * returned as a double.
     *
     * @param i first path parameter
     * @param j second path parameter
     * @param m first sample size
     * @param n second sample size
     * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
     */
    private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
        /*
         * Unwind the recursive definition given in [4].
         * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
         * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
         */
        final double[] lag = new double[n];
        double last = 0;
        // Initialize lag[k] to C(0, k + 1): the row-0 values of the recursion.
        for (int k = 0; k < n; k++) {
            lag[k] = c(0, k + 1, m, n, cnm, strict);
        }
        // Dynamic programming over rows 1..i: each cell combines the cell to its
        // left (last) and the cell in the previous row (lag[l - 1]), gated by C.
        for (int k = 1; k <= i; k++) {
            last = c(k, 0, m, n, cnm, strict);
            for (int l = 1; l <= j; l++) {
                lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
                last = lag[l - 1];
            }
        }
        return last;
    }
}