• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* statistic_tests.hpp header file
2  *
3  * Copyright Jens Maurer 2000
4  * Distributed under the Boost Software License, Version 1.0. (See
5  * accompanying file LICENSE_1_0.txt or copy at
6  * http://www.boost.org/LICENSE_1_0.txt)
7  *
8  * $Id$
9  *
10  */
11 
12 #ifndef STATISTIC_TESTS_HPP
13 #define STATISTIC_TESTS_HPP
14 
15 #include <stdexcept>
16 #include <iterator>
17 #include <vector>
18 #include <boost/limits.hpp>
19 #include <algorithm>
20 #include <cmath>
21 
22 #include <boost/config.hpp>
23 #include <boost/bind.hpp>
24 #include <boost/random/uniform_01.hpp>
25 #include <boost/random/variate_generator.hpp>
26 
27 #include "integrate.hpp"
28 
29 #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300
30 namespace std
31 {
pow(double a,double b)32   inline double pow(double a, double b) { return ::pow(a,b); }
ceil(double x)33   inline double ceil(double x) { return ::ceil(x); }
34 } // namespace std
35 #endif
36 
37 
38 template<class T>
fac(int k)39 inline T fac(int k)
40 {
41   T result = 1;
42   for(T i = 2; i <= k; ++i)
43     result *= i;
44   return result;
45 }
46 
47 template<class T>
binomial(int n,int k)48 T binomial(int n, int k)
49 {
50   if(k < n/2)
51     k = n-k;
52   T result = 1;
53   for(int i = k+1; i<= n; ++i)
54     result *= i;
55   return result / fac<T>(n-k);
56 }
57 
58 template<class T>
stirling2(int n,int m)59 T stirling2(int n, int m)
60 {
61   T sum = 0;
62   for(int k = 0; k <= m; ++k)
63     sum += binomial<T>(m, k) * std::pow(double(k), n) *
64       ( (m-k)%2 == 0 ? 1 : -1);
65   return sum / fac<T>(m);
66 }
67 
68 /*
69  * Experiments which create an empirical distribution in classes,
70  * suitable for the chi-square test.
71  */
72 // std::floor(gen() * classes)
73 
74 class experiment_base
75 {
76 public:
experiment_base(int cls)77   experiment_base(int cls) : _classes(cls) { }
classes() const78   unsigned int classes() const { return _classes; }
79 protected:
80   unsigned int _classes;
81 };
82 
83 class equidistribution_experiment : public experiment_base
84 {
85 public:
equidistribution_experiment(unsigned int classes)86   explicit equidistribution_experiment(unsigned int classes)
87     : experiment_base(classes) { }
88 
89   template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const90   void run(NumberGenerator & f, Counter & count, int n) const
91   {
92     assert((f.min)() == 0 &&
93            static_cast<unsigned int>((f.max)()) == classes()-1);
94     for(int i = 0; i < n; ++i)
95       count(f());
96   }
probability(int) const97   double probability(int /*i*/) const { return 1.0/classes(); }
98 };
99 
100 // two-dimensional equidistribution experiment
101 class equidistribution_2d_experiment : public equidistribution_experiment
102 {
103 public:
equidistribution_2d_experiment(unsigned int classes)104   explicit equidistribution_2d_experiment(unsigned int classes)
105     : equidistribution_experiment(classes) { }
106 
107   template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const108   void run(NumberGenerator & f, Counter & count, int n) const
109   {
110     unsigned int range = (f.max)()+1;
111     assert((f.min)() == 0 && range*range == classes());
112     for(int i = 0; i < n; ++i) {
113       int y1 = f();
114       int y2 = f();
115       count(y1 + range * y2);
116     }
117   }
118 };
119 
120 // distribution experiment: assume a probability density and
121 // count events so that an equidistribution results.
122 class distribution_experiment : public equidistribution_experiment
123 {
124 public:
125   template<class Distribution>
distribution_experiment(Distribution dist,unsigned int classes)126   distribution_experiment(Distribution dist , unsigned int classes)
127     : equidistribution_experiment(classes), limit(classes)
128   {
129     for(unsigned int i = 0; i < classes-1; ++i)
130       limit[i] = quantile(dist, (i+1)*0.05);
131     limit[classes-1] = std::numeric_limits<double>::infinity();
132     if(limit[classes-1] < (std::numeric_limits<double>::max)())
133       limit[classes-1] = (std::numeric_limits<double>::max)();
134 #if 0
135     std::cout << __PRETTY_FUNCTION__ << ": ";
136     for(unsigned int i = 0; i < classes; ++i)
137       std::cout << limit[i] << " ";
138     std::cout << std::endl;
139 #endif
140   }
141 
142   template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const143   void run(NumberGenerator & f, Counter & count, int n) const
144   {
145     for(int i = 0; i < n; ++i) {
146       limits_type::const_iterator it =
147         std::lower_bound(limit.begin(), limit.end(), f());
148       count(it-limit.begin());
149     }
150   }
151 private:
152   typedef std::vector<double> limits_type;
153   limits_type limit;
154 };
155 
156 template<bool up, bool is_float>
157 struct runs_direction_helper
158 {
159   template<class T>
initruns_direction_helper160   static T init(T)
161   {
162     return (std::numeric_limits<T>::max)();
163   }
164 };
165 
166 template<>
167 struct runs_direction_helper<true, true>
168 {
169   template<class T>
initruns_direction_helper170   static T init(T)
171   {
172     return -(std::numeric_limits<T>::max)();
173   }
174 };
175 
176 template<>
177 struct runs_direction_helper<true, false>
178 {
179   template<class T>
initruns_direction_helper180   static T init(T)
181   {
182     return (std::numeric_limits<T>::min)();
183   }
184 };
185 
186 // runs-up/runs-down experiment
187 template<bool up>
188 class runs_experiment : public experiment_base
189 {
190 public:
runs_experiment(unsigned int classes)191   explicit runs_experiment(unsigned int classes) : experiment_base(classes) { }
192 
193   template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const194   void run(NumberGenerator & f, Counter & count, int n) const
195   {
196     typedef typename NumberGenerator::result_type result_type;
197     result_type init =
198       runs_direction_helper<
199         up,
200         !std::numeric_limits<result_type>::is_integer
201       >::init(result_type());
202     result_type previous = init;
203     unsigned int length = 0;
204     for(int i = 0; i < n; ++i) {
205       result_type val = f();
206       if(up ? previous <= val : previous >= val) {
207         previous = val;
208         ++length;
209       } else {
210         count((std::min)(length, classes())-1);
211         length = 0;
212         previous = init;
213         // don't use this value, so that runs are independent
214       }
215     }
216   }
probability(unsigned int r) const217   double probability(unsigned int r) const
218   {
219     if(r == classes()-1)
220       return 1.0/fac<double>(classes());
221     else
222       return static_cast<double>(r+1)/fac<double>(r+2);
223   }
224 };
225 
226 // gap length experiment
227 class gap_experiment : public experiment_base
228 {
229 public:
230   template<class Dist>
gap_experiment(unsigned int classes,const Dist & dist,double alpha,double beta)231   gap_experiment(unsigned int classes, const Dist & dist, double alpha, double beta)
232     : experiment_base(classes), alpha(alpha), beta(beta), low(quantile(dist, alpha)), high(quantile(dist, beta)) {}
233 
234   template<class NumberGenerator, class Counter>
run(NumberGenerator & f,Counter & count,int n) const235   void run(NumberGenerator & f, Counter & count, int n) const
236   {
237     typedef typename NumberGenerator::result_type result_type;
238     unsigned int length = 0;
239     for(int i = 0; i < n; ) {
240       result_type value = f();
241       if(value < low || value > high)
242         ++length;
243       else {
244         count((std::min)(length, classes()-1));
245         length = 0;
246         ++i;
247       }
248     }
249   }
probability(unsigned int r) const250   double probability(unsigned int r) const
251   {
252     double p = beta-alpha;
253     if(r == classes()-1)
254       return std::pow(1-p, static_cast<double>(r));
255     else
256       return p * std::pow(1-p, static_cast<double>(r));
257   }
258 private:
259   double alpha, beta;
260   double low, high;
261 };
262 
263 // poker experiment
264 class poker_experiment : public experiment_base
265 {
266 public:
poker_experiment(unsigned int d,unsigned int k)267   poker_experiment(unsigned int d, unsigned int k)
268     : experiment_base(k), range(d)
269   {
270     assert(range > 1);
271   }
272 
273   template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n) const274   void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
275   {
276     typedef typename UniformRandomNumberGenerator::result_type result_type;
277     assert(std::numeric_limits<result_type>::is_integer);
278     assert((f.min)() == 0);
279     assert((f.max)() == static_cast<result_type>(range-1));
280     std::vector<result_type> v(classes());
281     for(int i = 0; i < n; ++i) {
282       for(unsigned int j = 0; j < classes(); ++j)
283         v[j] = f();
284       std::sort(v.begin(), v.end());
285       result_type prev = v[0];
286       int r = 1;     // count different values in v
287       for(unsigned int i = 1; i < classes(); ++i) {
288         if(prev != v[i]) {
289           prev = v[i];
290           ++r;
291         }
292       }
293       count(r-1);
294     }
295   }
296 
probability(unsigned int r) const297   double probability(unsigned int r) const
298   {
299     ++r;       // transform to 1 <= r <= 5
300     double result = range;
301     for(unsigned int i = 1; i < r; ++i)
302       result *= range-i;
303     return result / std::pow(range, static_cast<double>(classes())) *
304       stirling2<double>(classes(), r);
305   }
306 private:
307   unsigned int range;
308 };
309 
310 // coupon collector experiment
311 class coupon_collector_experiment : public experiment_base
312 {
313 public:
coupon_collector_experiment(unsigned int d,unsigned int cls)314   coupon_collector_experiment(unsigned int d, unsigned int cls)
315     : experiment_base(cls), d(d)
316   {
317     assert(d > 1);
318   }
319 
320   template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n) const321   void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
322   {
323     typedef typename UniformRandomNumberGenerator::result_type result_type;
324     assert(std::numeric_limits<result_type>::is_integer);
325     assert((f.min)() == 0);
326     assert((f.max)() == static_cast<result_type>(d-1));
327     std::vector<bool> occurs(d);
328     for(int i = 0; i < n; ++i) {
329       occurs.assign(d, false);
330       unsigned int r = 0;            // length of current sequence
331       int q = 0;                     // number of non-duplicates in current set
332       for(;;) {
333         result_type val = f();
334         ++r;
335         if(!occurs[val]) {       // new set element
336           occurs[val] = true;
337           ++q;
338           if(q == d)
339             break;     // one complete set
340         }
341       }
342       count((std::min)(r-d, classes()-1));
343     }
344   }
probability(unsigned int r) const345   double probability(unsigned int r) const
346   {
347     if(r == classes()-1)
348       return 1-fac<double>(d)/
349         std::pow(static_cast<double>(d), static_cast<double>(d+classes()-2)) *
350         stirling2<double>(d+classes()-2, d);
351     else
352       return fac<double>(d)/
353         std::pow(static_cast<double>(d), static_cast<double>(d+r)) *
354         stirling2<double>(d+r-1, d-1);
355   }
356 private:
357   int d;
358 };
359 
360 // permutation test
361 class permutation_experiment : public equidistribution_experiment
362 {
363 public:
permutation_experiment(unsigned int t)364   permutation_experiment(unsigned int t)
365     : equidistribution_experiment(fac<int>(t)), t(t)
366   {
367     assert(t > 1);
368   }
369 
370   template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n) const371   void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
372   {
373     typedef typename UniformRandomNumberGenerator::result_type result_type;
374     std::vector<result_type> v(t);
375     for(int i = 0; i < n; ++i) {
376       for(int j = 0; j < t; ++j) {
377         v[j] = f();
378       }
379       int x = 0;
380       for(int r = t-1; r > 0; r--) {
381         typename std::vector<result_type>::iterator it =
382           std::max_element(v.begin(), v.begin()+r+1);
383         x = (r+1)*x + (it-v.begin());
384         std::iter_swap(it, v.begin()+r);
385       }
386       count(x);
387     }
388   }
389 private:
390   int t;
391 };
392 
393 // birthday spacing experiment test
394 class birthday_spacing_experiment : public experiment_base
395 {
396 public:
birthday_spacing_experiment(unsigned int d,int n,int m)397   birthday_spacing_experiment(unsigned int d, int n, int m)
398     : experiment_base(d), n(n), m(m)
399   {
400   }
401 
402   template<class UniformRandomNumberGenerator, class Counter>
run(UniformRandomNumberGenerator & f,Counter & count,int n_total) const403   void run(UniformRandomNumberGenerator & f, Counter & count, int n_total) const
404   {
405     typedef typename UniformRandomNumberGenerator::result_type result_type;
406     assert(std::numeric_limits<result_type>::is_integer);
407     assert((f.min)() == 0);
408     assert((f.max)() == static_cast<result_type>(m-1));
409 
410     for(int j = 0; j < n_total; j++) {
411       std::vector<result_type> v(n);
412       std::generate_n(v.begin(), n, f);
413       std::sort(v.begin(), v.end());
414       std::vector<result_type> spacing(n);
415       for(int i = 0; i < n-1; i++)
416         spacing[i] = v[i+1]-v[i];
417       spacing[n-1] = v[0] + m - v[n-1];
418       std::sort(spacing.begin(), spacing.end());
419       unsigned int k = 0;
420       for(int i = 0; i < n-1; ++i) {
421         if(spacing[i] == spacing[i+1])
422           ++k;
423       }
424       count((std::min)(k, classes()-1));
425     }
426   }
427 
probability(unsigned int r) const428   double probability(unsigned int r) const
429   {
430     assert(classes() == 4);
431     assert(m == (1<<25));
432     assert(n == 512);
433     static const double prob[] = { 0.368801577, 0.369035243, 0.183471182,
434                                    0.078691997 };
435     return prob[r];
436   }
437 private:
438   int n, m;
439 };
440 /*
441  * Misc. helper functions.
442  */
443 
444 template<class Float>
445 struct distribution_function
446 {
447   typedef Float result_type;
448   typedef Float argument_type;
449   typedef Float first_argument_type;
450   typedef Float second_argument_type;
451 };
452 
453 // computes P(K_n <= t) or P(t1 <= K_n <= t2).  See Knuth, 3.3.1
454 class kolmogorov_smirnov_probability : public distribution_function<double>
455 {
456 public:
kolmogorov_smirnov_probability(int n)457   kolmogorov_smirnov_probability(int n)
458     : approx(n > 50), n(n), sqrt_n(std::sqrt(double(n)))
459   {
460     if(!approx)
461       n_n = std::pow(static_cast<double>(n), n);
462   }
463 
cdf(double t) const464   double cdf(double t) const
465   {
466     if(approx) {
467       return 1-std::exp(-2*t*t)*(1-2.0/3.0*t/sqrt_n);
468     } else {
469       t *= sqrt_n;
470       double sum = 0;
471       for(int k = static_cast<int>(std::ceil(t)); k <= n; k++)
472         sum += binomial<double>(n, k) * std::pow(k-t, k) *
473           std::pow(t+n-k, n-k-1);
474       return 1 - t/n_n * sum;
475     }
476   }
477   //double operator()(double t1, double t2) const
478   //{ return operator()(t2) - operator()(t1); }
479 
480 private:
481   bool approx;
482   int n;
483   double sqrt_n;
484   double n_n;
485 };
486 
cdf(const kolmogorov_smirnov_probability & dist,double val)487 inline double cdf(const kolmogorov_smirnov_probability& dist, double val)
488 {
489   return dist.cdf(val);
490 }
491 
quantile(const kolmogorov_smirnov_probability & dist,double val)492 inline double quantile(const kolmogorov_smirnov_probability& dist, double val)
493 {
494     return invert_monotone_inc(boost::bind(&cdf, dist, _1), val, 0.0, 1000.0);
495 }
496 
497 /*
498  * Experiments for generators with continuous distribution functions
499  */
500 class kolmogorov_experiment
501 {
502 public:
kolmogorov_experiment(int n)503   kolmogorov_experiment(int n) : n(n), ksp(n) { }
504   template<class NumberGenerator, class Distribution>
run(NumberGenerator & gen,Distribution distrib) const505   double run(NumberGenerator & gen, Distribution distrib) const
506   {
507     const int m = n;
508     typedef std::vector<double> saved_temp;
509     saved_temp a(m,1.0), b(m,0);
510     std::vector<int> c(m,0);
511     for(int i = 0; i < n; ++i) {
512       double val = static_cast<double>(gen());
513       double y = cdf(distrib, val);
514       int k = static_cast<int>(std::floor(m*y));
515       if(k >= m)
516         --k;    // should not happen
517       a[k] = (std::min)(a[k], y);
518       b[k] = (std::max)(b[k], y);
519       ++c[k];
520     }
521     double kplus = 0, kminus = 0;
522     int j = 0;
523     for(int k = 0; k < m; ++k) {
524       if(c[k] > 0) {
525         kminus = (std::max)(kminus, a[k]-j/static_cast<double>(n));
526         j += c[k];
527         kplus = (std::max)(kplus, j/static_cast<double>(n) - b[k]);
528       }
529     }
530     kplus *= std::sqrt(double(n));
531     kminus *= std::sqrt(double(n));
532     // std::cout << "k+ " << kplus << "   k- " << kminus << std::endl;
533     return kplus;
534   }
probability(double x) const535   double probability(double x) const
536   {
537     return cdf(ksp, x);
538   }
539 private:
540   int n;
541   kolmogorov_smirnov_probability ksp;
542 };
543 
544 struct power_distribution
545 {
power_distributionpower_distribution546   power_distribution(double t) : t(t) {}
547   double t;
548 };
549 
cdf(const power_distribution & dist,double val)550 double cdf(const power_distribution& dist, double val)
551 {
552   return std::pow(val, dist.t);
553 }
554 
555 // maximum-of-t test (KS-based)
556 template<class UniformRandomNumberGenerator>
557 class maximum_experiment
558 {
559 public:
560   typedef UniformRandomNumberGenerator base_type;
maximum_experiment(base_type & f,int n,int t)561   maximum_experiment(base_type & f, int n, int t) : f(f), ke(n), t(t)
562   { }
563 
operator ()() const564   double operator()() const
565   {
566     generator gen(f, t);
567     return ke.run(gen, power_distribution(t));
568   }
569 
570 private:
571   struct generator {
generatormaximum_experiment::generator572     generator(base_type & f, int t) : f(f, boost::uniform_01<>()), t(t) { }
operator ()maximum_experiment::generator573     double operator()()
574     {
575       double mx = f();
576       for(int i = 1; i < t; ++i)
577         mx = (std::max)(mx, f());
578       return mx;
579     }
580   private:
581     boost::variate_generator<base_type&, boost::uniform_01<> > f;
582     int t;
583   };
584   base_type & f;
585   kolmogorov_experiment ke;
586   int t;
587 };
588 
589 // compute a chi-square value for the distribution approximation error
590 template<class ForwardIterator, class UnaryFunction>
591 typename UnaryFunction::result_type
chi_square_value(ForwardIterator first,ForwardIterator last,UnaryFunction probability)592 chi_square_value(ForwardIterator first, ForwardIterator last,
593                  UnaryFunction probability)
594 {
595   typedef std::iterator_traits<ForwardIterator> iter_traits;
596   typedef typename iter_traits::value_type counter_type;
597   typedef typename UnaryFunction::result_type result_type;
598   unsigned int classes = std::distance(first, last);
599   result_type sum = 0;
600   counter_type n = 0;
601   for(unsigned int i = 0; i < classes; ++first, ++i) {
602     counter_type count = *first;
603     n += count;
604     sum += (count/probability(i)) * count;  // avoid overflow
605   }
606 #if 0
607   for(unsigned int i = 0; i < classes; ++i) {
608     // std::cout << (n*probability(i)) << " ";
609     if(n * probability(i) < 5)
610       std::cerr << "Not enough test runs for slot " << i
611                 << " p=" << probability(i) << ", n=" << n
612                 << std::endl;
613   }
614 #endif
615   // std::cout << std::endl;
616   // throw std::invalid_argument("not enough test runs");
617 
618   return sum/n - n;
619 }
620 template<class RandomAccessContainer>
621 class generic_counter
622 {
623 public:
generic_counter(unsigned int classes)624   explicit generic_counter(unsigned int classes) : container(classes, 0) { }
operator ()(int i)625   void operator()(int i)
626   {
627     assert(i >= 0);
628     assert(static_cast<unsigned int>(i) < container.size());
629     ++container[i];
630   }
begin() const631   typename RandomAccessContainer::const_iterator begin() const
632   { return container.begin(); }
end() const633   typename RandomAccessContainer::const_iterator end() const
634   { return container.end(); }
635 
636 private:
637   RandomAccessContainer container;
638 };
639 
640 // chi_square test
641 template<class Experiment, class Generator>
run_experiment(const Experiment & experiment,Generator & gen,int n)642 double run_experiment(const Experiment & experiment, Generator & gen, int n)
643 {
644   generic_counter<std::vector<int> > v(experiment.classes());
645   experiment.run(gen, v, n);
646   return chi_square_value(v.begin(), v.end(),
647                           boost::bind(&Experiment::probability,
648                                        experiment, boost::placeholders::_1));
649 }
650 
651 // chi_square test
652 template<class Experiment, class Generator>
run_experiment(const Experiment & experiment,const Generator & gen,int n)653 double run_experiment(const Experiment & experiment, const Generator & gen, int n)
654 {
655   generic_counter<std::vector<int> > v(experiment.classes());
656   experiment.run(gen, v, n);
657   return chi_square_value(v.begin(), v.end(),
658                           boost::bind(&Experiment::probability,
659                                        experiment, boost::placeholders::_1));
660 }
661 
662 // number generator with experiment results (for nesting)
663 template<class Experiment, class Generator>
664 class experiment_generator_t
665 {
666 public:
experiment_generator_t(const Experiment & exper,Generator & gen,int n)667   experiment_generator_t(const Experiment & exper, Generator & gen, int n)
668     : experiment(exper), generator(gen), n(n) { }
operator ()() const669   double operator()() const { return run_experiment(experiment, generator, n); }
670 private:
671   const Experiment & experiment;
672   Generator & generator;
673   int n;
674 };
675 
676 template<class Experiment, class Generator>
677 experiment_generator_t<Experiment, Generator>
experiment_generator(const Experiment & e,Generator & gen,int n)678 experiment_generator(const Experiment & e, Generator & gen, int n)
679 {
680   return experiment_generator_t<Experiment, Generator>(e, gen, n);
681 }
682 
683 
684 template<class Experiment, class Generator, class Distribution>
685 class ks_experiment_generator_t
686 {
687 public:
ks_experiment_generator_t(const Experiment & exper,Generator & gen,const Distribution & distrib)688   ks_experiment_generator_t(const Experiment & exper, Generator & gen,
689                             const Distribution & distrib)
690     : experiment(exper), generator(gen), distribution(distrib) { }
operator ()() const691   double operator()() const { return experiment.run(generator, distribution); }
692 private:
693   const Experiment & experiment;
694   Generator & generator;
695   Distribution distribution;
696 };
697 
698 template<class Experiment, class Generator, class Distribution>
699 ks_experiment_generator_t<Experiment, Generator, Distribution>
ks_experiment_generator(const Experiment & e,Generator & gen,const Distribution & distrib)700 ks_experiment_generator(const Experiment & e, Generator & gen,
701                         const Distribution & distrib)
702 {
703   return ks_experiment_generator_t<Experiment, Generator, Distribution>
704     (e, gen, distrib);
705 }
706 
707 
708 #endif /* STATISTIC_TESTS_HPP */
709 
710