1 /* statistic_tests.hpp header file
2 *
3 * Copyright Jens Maurer 2000
4 * Distributed under the Boost Software License, Version 1.0. (See
5 * accompanying file LICENSE_1_0.txt or copy at
6 * http://www.boost.org/LICENSE_1_0.txt)
7 *
8 * $Id$
9 *
10 */
11
12 #ifndef STATISTIC_TESTS_HPP
13 #define STATISTIC_TESTS_HPP
14
15 #include <stdexcept>
16 #include <iterator>
17 #include <vector>
18 #include <boost/limits.hpp>
19 #include <algorithm>
20 #include <cmath>
21
22 #include <boost/config.hpp>
23 #include <boost/bind.hpp>
24 #include <boost/random/uniform_01.hpp>
25 #include <boost/random/variate_generator.hpp>
26
27 #include "integrate.hpp"
28
29 #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300
30 namespace std
31 {
pow(double a,double b)32 inline double pow(double a, double b) { return ::pow(a,b); }
ceil(double x)33 inline double ceil(double x) { return ::ceil(x); }
34 } // namespace std
35 #endif
36
37
38 template<class T>
fac(int k)39 inline T fac(int k)
40 {
41 T result = 1;
42 for(T i = 2; i <= k; ++i)
43 result *= i;
44 return result;
45 }
46
47 template<class T>
binomial(int n,int k)48 T binomial(int n, int k)
49 {
50 if(k < n/2)
51 k = n-k;
52 T result = 1;
53 for(int i = k+1; i<= n; ++i)
54 result *= i;
55 return result / fac<T>(n-k);
56 }
57
58 template<class T>
stirling2(int n,int m)59 T stirling2(int n, int m)
60 {
61 T sum = 0;
62 for(int k = 0; k <= m; ++k)
63 sum += binomial<T>(m, k) * std::pow(double(k), n) *
64 ( (m-k)%2 == 0 ? 1 : -1);
65 return sum / fac<T>(m);
66 }
67
68 /*
69 * Experiments which create an empirical distribution in classes,
70 * suitable for the chi-square test.
71 */
72 // std::floor(gen() * classes)
73
74 class experiment_base
75 {
76 public:
experiment_base(int cls)77 experiment_base(int cls) : _classes(cls) { }
classes() const78 unsigned int classes() const { return _classes; }
79 protected:
80 unsigned int _classes;
81 };
82
83 class equidistribution_experiment : public experiment_base
84 {
85 public:
equidistribution_experiment(unsigned int classes)86 explicit equidistribution_experiment(unsigned int classes)
87 : experiment_base(classes) { }
88
89 template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const90 void run(NumberGenerator & f, Counter & count, int n) const
91 {
92 assert((f.min)() == 0 &&
93 static_cast<unsigned int>((f.max)()) == classes()-1);
94 for(int i = 0; i < n; ++i)
95 count(f());
96 }
probability(int) const97 double probability(int /*i*/) const { return 1.0/classes(); }
98 };
99
100 // two-dimensional equidistribution experiment
101 class equidistribution_2d_experiment : public equidistribution_experiment
102 {
103 public:
equidistribution_2d_experiment(unsigned int classes)104 explicit equidistribution_2d_experiment(unsigned int classes)
105 : equidistribution_experiment(classes) { }
106
107 template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const108 void run(NumberGenerator & f, Counter & count, int n) const
109 {
110 unsigned int range = (f.max)()+1;
111 assert((f.min)() == 0 && range*range == classes());
112 for(int i = 0; i < n; ++i) {
113 int y1 = f();
114 int y2 = f();
115 count(y1 + range * y2);
116 }
117 }
118 };
119
120 // distribution experiment: assume a probability density and
121 // count events so that an equidistribution results.
122 class distribution_experiment : public equidistribution_experiment
123 {
124 public:
125 template<class Distribution>
distribution_experiment(Distribution dist,unsigned int classes)126 distribution_experiment(Distribution dist , unsigned int classes)
127 : equidistribution_experiment(classes), limit(classes)
128 {
129 for(unsigned int i = 0; i < classes-1; ++i)
130 limit[i] = quantile(dist, (i+1)*0.05);
131 limit[classes-1] = std::numeric_limits<double>::infinity();
132 if(limit[classes-1] < (std::numeric_limits<double>::max)())
133 limit[classes-1] = (std::numeric_limits<double>::max)();
134 #if 0
135 std::cout << __PRETTY_FUNCTION__ << ": ";
136 for(unsigned int i = 0; i < classes; ++i)
137 std::cout << limit[i] << " ";
138 std::cout << std::endl;
139 #endif
140 }
141
142 template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const143 void run(NumberGenerator & f, Counter & count, int n) const
144 {
145 for(int i = 0; i < n; ++i) {
146 limits_type::const_iterator it =
147 std::lower_bound(limit.begin(), limit.end(), f());
148 count(it-limit.begin());
149 }
150 }
151 private:
152 typedef std::vector<double> limits_type;
153 limits_type limit;
154 };
155
156 template<bool up, bool is_float>
157 struct runs_direction_helper
158 {
159 template<class T>
initruns_direction_helper160 static T init(T)
161 {
162 return (std::numeric_limits<T>::max)();
163 }
164 };
165
166 template<>
167 struct runs_direction_helper<true, true>
168 {
169 template<class T>
initruns_direction_helper170 static T init(T)
171 {
172 return -(std::numeric_limits<T>::max)();
173 }
174 };
175
176 template<>
177 struct runs_direction_helper<true, false>
178 {
179 template<class T>
initruns_direction_helper180 static T init(T)
181 {
182 return (std::numeric_limits<T>::min)();
183 }
184 };
185
186 // runs-up/runs-down experiment
187 template<bool up>
188 class runs_experiment : public experiment_base
189 {
190 public:
runs_experiment(unsigned int classes)191 explicit runs_experiment(unsigned int classes) : experiment_base(classes) { }
192
193 template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const194 void run(NumberGenerator & f, Counter & count, int n) const
195 {
196 typedef typename NumberGenerator::result_type result_type;
197 result_type init =
198 runs_direction_helper<
199 up,
200 !std::numeric_limits<result_type>::is_integer
201 >::init(result_type());
202 result_type previous = init;
203 unsigned int length = 0;
204 for(int i = 0; i < n; ++i) {
205 result_type val = f();
206 if(up ? previous <= val : previous >= val) {
207 previous = val;
208 ++length;
209 } else {
210 count((std::min)(length, classes())-1);
211 length = 0;
212 previous = init;
213 // don't use this value, so that runs are independent
214 }
215 }
216 }
probability(unsigned int r) const217 double probability(unsigned int r) const
218 {
219 if(r == classes()-1)
220 return 1.0/fac<double>(classes());
221 else
222 return static_cast<double>(r+1)/fac<double>(r+2);
223 }
224 };
225
226 // gap length experiment
227 class gap_experiment : public experiment_base
228 {
229 public:
230 template<class Dist>
gap_experiment(unsigned int classes,const Dist & dist,double alpha,double beta)231 gap_experiment(unsigned int classes, const Dist & dist, double alpha, double beta)
232 : experiment_base(classes), alpha(alpha), beta(beta), low(quantile(dist, alpha)), high(quantile(dist, beta)) {}
233
234 template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const235 void run(NumberGenerator & f, Counter & count, int n) const
236 {
237 typedef typename NumberGenerator::result_type result_type;
238 unsigned int length = 0;
239 for(int i = 0; i < n; ) {
240 result_type value = f();
241 if(value < low || value > high)
242 ++length;
243 else {
244 count((std::min)(length, classes()-1));
245 length = 0;
246 ++i;
247 }
248 }
249 }
probability(unsigned int r) const250 double probability(unsigned int r) const
251 {
252 double p = beta-alpha;
253 if(r == classes()-1)
254 return std::pow(1-p, static_cast<double>(r));
255 else
256 return p * std::pow(1-p, static_cast<double>(r));
257 }
258 private:
259 double alpha, beta;
260 double low, high;
261 };
262
263 // poker experiment
264 class poker_experiment : public experiment_base
265 {
266 public:
poker_experiment(unsigned int d,unsigned int k)267 poker_experiment(unsigned int d, unsigned int k)
268 : experiment_base(k), range(d)
269 {
270 assert(range > 1);
271 }
272
273 template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n) const274 void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
275 {
276 typedef typename UniformRandomNumberGenerator::result_type result_type;
277 assert(std::numeric_limits<result_type>::is_integer);
278 assert((f.min)() == 0);
279 assert((f.max)() == static_cast<result_type>(range-1));
280 std::vector<result_type> v(classes());
281 for(int i = 0; i < n; ++i) {
282 for(unsigned int j = 0; j < classes(); ++j)
283 v[j] = f();
284 std::sort(v.begin(), v.end());
285 result_type prev = v[0];
286 int r = 1; // count different values in v
287 for(unsigned int i = 1; i < classes(); ++i) {
288 if(prev != v[i]) {
289 prev = v[i];
290 ++r;
291 }
292 }
293 count(r-1);
294 }
295 }
296
probability(unsigned int r) const297 double probability(unsigned int r) const
298 {
299 ++r; // transform to 1 <= r <= 5
300 double result = range;
301 for(unsigned int i = 1; i < r; ++i)
302 result *= range-i;
303 return result / std::pow(range, static_cast<double>(classes())) *
304 stirling2<double>(classes(), r);
305 }
306 private:
307 unsigned int range;
308 };
309
310 // coupon collector experiment
311 class coupon_collector_experiment : public experiment_base
312 {
313 public:
coupon_collector_experiment(unsigned int d,unsigned int cls)314 coupon_collector_experiment(unsigned int d, unsigned int cls)
315 : experiment_base(cls), d(d)
316 {
317 assert(d > 1);
318 }
319
320 template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n) const321 void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
322 {
323 typedef typename UniformRandomNumberGenerator::result_type result_type;
324 assert(std::numeric_limits<result_type>::is_integer);
325 assert((f.min)() == 0);
326 assert((f.max)() == static_cast<result_type>(d-1));
327 std::vector<bool> occurs(d);
328 for(int i = 0; i < n; ++i) {
329 occurs.assign(d, false);
330 unsigned int r = 0; // length of current sequence
331 int q = 0; // number of non-duplicates in current set
332 for(;;) {
333 result_type val = f();
334 ++r;
335 if(!occurs[val]) { // new set element
336 occurs[val] = true;
337 ++q;
338 if(q == d)
339 break; // one complete set
340 }
341 }
342 count((std::min)(r-d, classes()-1));
343 }
344 }
probability(unsigned int r) const345 double probability(unsigned int r) const
346 {
347 if(r == classes()-1)
348 return 1-fac<double>(d)/
349 std::pow(static_cast<double>(d), static_cast<double>(d+classes()-2)) *
350 stirling2<double>(d+classes()-2, d);
351 else
352 return fac<double>(d)/
353 std::pow(static_cast<double>(d), static_cast<double>(d+r)) *
354 stirling2<double>(d+r-1, d-1);
355 }
356 private:
357 int d;
358 };
359
360 // permutation test
361 class permutation_experiment : public equidistribution_experiment
362 {
363 public:
permutation_experiment(unsigned int t)364 permutation_experiment(unsigned int t)
365 : equidistribution_experiment(fac<int>(t)), t(t)
366 {
367 assert(t > 1);
368 }
369
370 template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n) const371 void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
372 {
373 typedef typename UniformRandomNumberGenerator::result_type result_type;
374 std::vector<result_type> v(t);
375 for(int i = 0; i < n; ++i) {
376 for(int j = 0; j < t; ++j) {
377 v[j] = f();
378 }
379 int x = 0;
380 for(int r = t-1; r > 0; r--) {
381 typename std::vector<result_type>::iterator it =
382 std::max_element(v.begin(), v.begin()+r+1);
383 x = (r+1)*x + (it-v.begin());
384 std::iter_swap(it, v.begin()+r);
385 }
386 count(x);
387 }
388 }
389 private:
390 int t;
391 };
392
393 // birthday spacing experiment test
394 class birthday_spacing_experiment : public experiment_base
395 {
396 public:
birthday_spacing_experiment(unsigned int d,int n,int m)397 birthday_spacing_experiment(unsigned int d, int n, int m)
398 : experiment_base(d), n(n), m(m)
399 {
400 }
401
402 template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n_total) const403 void run(UniformRandomNumberGenerator & f, Counter & count, int n_total) const
404 {
405 typedef typename UniformRandomNumberGenerator::result_type result_type;
406 assert(std::numeric_limits<result_type>::is_integer);
407 assert((f.min)() == 0);
408 assert((f.max)() == static_cast<result_type>(m-1));
409
410 for(int j = 0; j < n_total; j++) {
411 std::vector<result_type> v(n);
412 std::generate_n(v.begin(), n, f);
413 std::sort(v.begin(), v.end());
414 std::vector<result_type> spacing(n);
415 for(int i = 0; i < n-1; i++)
416 spacing[i] = v[i+1]-v[i];
417 spacing[n-1] = v[0] + m - v[n-1];
418 std::sort(spacing.begin(), spacing.end());
419 unsigned int k = 0;
420 for(int i = 0; i < n-1; ++i) {
421 if(spacing[i] == spacing[i+1])
422 ++k;
423 }
424 count((std::min)(k, classes()-1));
425 }
426 }
427
probability(unsigned int r) const428 double probability(unsigned int r) const
429 {
430 assert(classes() == 4);
431 assert(m == (1<<25));
432 assert(n == 512);
433 static const double prob[] = { 0.368801577, 0.369035243, 0.183471182,
434 0.078691997 };
435 return prob[r];
436 }
437 private:
438 int n, m;
439 };
440 /*
441 * Misc. helper functions.
442 */
443
444 template<class Float>
445 struct distribution_function
446 {
447 typedef Float result_type;
448 typedef Float argument_type;
449 typedef Float first_argument_type;
450 typedef Float second_argument_type;
451 };
452
453 // computes P(K_n <= t) or P(t1 <= K_n <= t2). See Knuth, 3.3.1
454 class kolmogorov_smirnov_probability : public distribution_function<double>
455 {
456 public:
kolmogorov_smirnov_probability(int n)457 kolmogorov_smirnov_probability(int n)
458 : approx(n > 50), n(n), sqrt_n(std::sqrt(double(n)))
459 {
460 if(!approx)
461 n_n = std::pow(static_cast<double>(n), n);
462 }
463
cdf(double t) const464 double cdf(double t) const
465 {
466 if(approx) {
467 return 1-std::exp(-2*t*t)*(1-2.0/3.0*t/sqrt_n);
468 } else {
469 t *= sqrt_n;
470 double sum = 0;
471 for(int k = static_cast<int>(std::ceil(t)); k <= n; k++)
472 sum += binomial<double>(n, k) * std::pow(k-t, k) *
473 std::pow(t+n-k, n-k-1);
474 return 1 - t/n_n * sum;
475 }
476 }
477 //double operator()(double t1, double t2) const
478 //{ return operator()(t2) - operator()(t1); }
479
480 private:
481 bool approx;
482 int n;
483 double sqrt_n;
484 double n_n;
485 };
486
cdf(const kolmogorov_smirnov_probability & dist,double val)487 inline double cdf(const kolmogorov_smirnov_probability& dist, double val)
488 {
489 return dist.cdf(val);
490 }
491
quantile(const kolmogorov_smirnov_probability & dist,double val)492 inline double quantile(const kolmogorov_smirnov_probability& dist, double val)
493 {
494 return invert_monotone_inc(boost::bind(&cdf, dist, _1), val, 0.0, 1000.0);
495 }
496
497 /*
498 * Experiments for generators with continuous distribution functions
499 */
500 class kolmogorov_experiment
501 {
502 public:
kolmogorov_experiment(int n)503 kolmogorov_experiment(int n) : n(n), ksp(n) { }
504 template<class NumberGenerator, class Distribution>
run(NumberGenerator & gen,Distribution distrib) const505 double run(NumberGenerator & gen, Distribution distrib) const
506 {
507 const int m = n;
508 typedef std::vector<double> saved_temp;
509 saved_temp a(m,1.0), b(m,0);
510 std::vector<int> c(m,0);
511 for(int i = 0; i < n; ++i) {
512 double val = static_cast<double>(gen());
513 double y = cdf(distrib, val);
514 int k = static_cast<int>(std::floor(m*y));
515 if(k >= m)
516 --k; // should not happen
517 a[k] = (std::min)(a[k], y);
518 b[k] = (std::max)(b[k], y);
519 ++c[k];
520 }
521 double kplus = 0, kminus = 0;
522 int j = 0;
523 for(int k = 0; k < m; ++k) {
524 if(c[k] > 0) {
525 kminus = (std::max)(kminus, a[k]-j/static_cast<double>(n));
526 j += c[k];
527 kplus = (std::max)(kplus, j/static_cast<double>(n) - b[k]);
528 }
529 }
530 kplus *= std::sqrt(double(n));
531 kminus *= std::sqrt(double(n));
532 // std::cout << "k+ " << kplus << " k- " << kminus << std::endl;
533 return kplus;
534 }
probability(double x) const535 double probability(double x) const
536 {
537 return cdf(ksp, x);
538 }
539 private:
540 int n;
541 kolmogorov_smirnov_probability ksp;
542 };
543
544 struct power_distribution
545 {
power_distributionpower_distribution546 power_distribution(double t) : t(t) {}
547 double t;
548 };
549
cdf(const power_distribution & dist,double val)550 double cdf(const power_distribution& dist, double val)
551 {
552 return std::pow(val, dist.t);
553 }
554
555 // maximum-of-t test (KS-based)
556 template<class UniformRandomNumberGenerator>
557 class maximum_experiment
558 {
559 public:
560 typedef UniformRandomNumberGenerator base_type;
maximum_experiment(base_type & f,int n,int t)561 maximum_experiment(base_type & f, int n, int t) : f(f), ke(n), t(t)
562 { }
563
operator ()() const564 double operator()() const
565 {
566 generator gen(f, t);
567 return ke.run(gen, power_distribution(t));
568 }
569
570 private:
571 struct generator {
generatormaximum_experiment::generator572 generator(base_type & f, int t) : f(f, boost::uniform_01<>()), t(t) { }
operator ()maximum_experiment::generator573 double operator()()
574 {
575 double mx = f();
576 for(int i = 1; i < t; ++i)
577 mx = (std::max)(mx, f());
578 return mx;
579 }
580 private:
581 boost::variate_generator<base_type&, boost::uniform_01<> > f;
582 int t;
583 };
584 base_type & f;
585 kolmogorov_experiment ke;
586 int t;
587 };
588
589 // compute a chi-square value for the distribution approximation error
590 template<class ForwardIterator, class UnaryFunction>
591 typename UnaryFunction::result_type
chi_square_value(ForwardIterator first,ForwardIterator last,UnaryFunction probability)592 chi_square_value(ForwardIterator first, ForwardIterator last,
593 UnaryFunction probability)
594 {
595 typedef std::iterator_traits<ForwardIterator> iter_traits;
596 typedef typename iter_traits::value_type counter_type;
597 typedef typename UnaryFunction::result_type result_type;
598 unsigned int classes = std::distance(first, last);
599 result_type sum = 0;
600 counter_type n = 0;
601 for(unsigned int i = 0; i < classes; ++first, ++i) {
602 counter_type count = *first;
603 n += count;
604 sum += (count/probability(i)) * count; // avoid overflow
605 }
606 #if 0
607 for(unsigned int i = 0; i < classes; ++i) {
608 // std::cout << (n*probability(i)) << " ";
609 if(n * probability(i) < 5)
610 std::cerr << "Not enough test runs for slot " << i
611 << " p=" << probability(i) << ", n=" << n
612 << std::endl;
613 }
614 #endif
615 // std::cout << std::endl;
616 // throw std::invalid_argument("not enough test runs");
617
618 return sum/n - n;
619 }
620 template<class RandomAccessContainer>
621 class generic_counter
622 {
623 public:
generic_counter(unsigned int classes)624 explicit generic_counter(unsigned int classes) : container(classes, 0) { }
operator ()(int i)625 void operator()(int i)
626 {
627 assert(i >= 0);
628 assert(static_cast<unsigned int>(i) < container.size());
629 ++container[i];
630 }
begin() const631 typename RandomAccessContainer::const_iterator begin() const
632 { return container.begin(); }
end() const633 typename RandomAccessContainer::const_iterator end() const
634 { return container.end(); }
635
636 private:
637 RandomAccessContainer container;
638 };
639
640 // chi_square test
641 template<class Experiment, class Generator>
run_experiment(const Experiment & experiment,Generator & gen,int n)642 double run_experiment(const Experiment & experiment, Generator & gen, int n)
643 {
644 generic_counter<std::vector<int> > v(experiment.classes());
645 experiment.run(gen, v, n);
646 return chi_square_value(v.begin(), v.end(),
647 boost::bind(&Experiment::probability,
648 experiment, boost::placeholders::_1));
649 }
650
651 // chi_square test
652 template<class Experiment, class Generator>
run_experiment(const Experiment & experiment,const Generator & gen,int n)653 double run_experiment(const Experiment & experiment, const Generator & gen, int n)
654 {
655 generic_counter<std::vector<int> > v(experiment.classes());
656 experiment.run(gen, v, n);
657 return chi_square_value(v.begin(), v.end(),
658 boost::bind(&Experiment::probability,
659 experiment, boost::placeholders::_1));
660 }
661
662 // number generator with experiment results (for nesting)
663 template<class Experiment, class Generator>
664 class experiment_generator_t
665 {
666 public:
experiment_generator_t(const Experiment & exper,Generator & gen,int n)667 experiment_generator_t(const Experiment & exper, Generator & gen, int n)
668 : experiment(exper), generator(gen), n(n) { }
operator ()() const669 double operator()() const { return run_experiment(experiment, generator, n); }
670 private:
671 const Experiment & experiment;
672 Generator & generator;
673 int n;
674 };
675
676 template<class Experiment, class Generator>
677 experiment_generator_t<Experiment, Generator>
experiment_generator(const Experiment & e,Generator & gen,int n)678 experiment_generator(const Experiment & e, Generator & gen, int n)
679 {
680 return experiment_generator_t<Experiment, Generator>(e, gen, n);
681 }
682
683
684 template<class Experiment, class Generator, class Distribution>
685 class ks_experiment_generator_t
686 {
687 public:
ks_experiment_generator_t(const Experiment & exper,Generator & gen,const Distribution & distrib)688 ks_experiment_generator_t(const Experiment & exper, Generator & gen,
689 const Distribution & distrib)
690 : experiment(exper), generator(gen), distribution(distrib) { }
operator ()() const691 double operator()() const { return experiment.run(generator, distribution); }
692 private:
693 const Experiment & experiment;
694 Generator & generator;
695 Distribution distribution;
696 };
697
698 template<class Experiment, class Generator, class Distribution>
699 ks_experiment_generator_t<Experiment, Generator, Distribution>
ks_experiment_generator(const Experiment & e,Generator & gen,const Distribution & distrib)700 ks_experiment_generator(const Experiment & e, Generator & gen,
701 const Distribution & distrib)
702 {
703 return ks_experiment_generator_t<Experiment, Generator, Distribution>
704 (e, gen, distrib);
705 }
706
707
708 #endif /* STATISTIC_TESTS_HPP */
709
710