• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //  (C) Copyright Nick Thompson 2018.
2 //  Use, modification and distribution are subject to the
3 //  Boost Software License, Version 1.0. (See accompanying file
4 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5 
6 #ifndef BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP
7 #define BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP
8 
9 #include <algorithm>
10 #include <iterator>
11 #include <tuple>
12 #include <cmath>
13 #include <boost/assert.hpp>
14 
15 namespace boost::math::statistics {
16 
17 template<class ForwardIterator>
mean(ForwardIterator first,ForwardIterator last)18 auto mean(ForwardIterator first, ForwardIterator last)
19 {
20     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
21     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean.");
22     if constexpr (std::is_integral<Real>::value)
23     {
24         double mu = 0;
25         double i = 1;
26         for(auto it = first; it != last; ++it) {
27             mu = mu + (*it - mu)/i;
28             i += 1;
29         }
30         return mu;
31     }
32     else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>)
33     {
34         size_t elements = std::distance(first, last);
35         Real mu0 = 0;
36         Real mu1 = 0;
37         Real mu2 = 0;
38         Real mu3 = 0;
39         Real i = 1;
40         auto end = last - (elements % 4);
41         for(auto it = first; it != end;  it += 4) {
42             Real inv = Real(1)/i;
43             Real tmp0 = (*it - mu0);
44             Real tmp1 = (*(it+1) - mu1);
45             Real tmp2 = (*(it+2) - mu2);
46             Real tmp3 = (*(it+3) - mu3);
47             // please generate a vectorized fma here
48             mu0 += tmp0*inv;
49             mu1 += tmp1*inv;
50             mu2 += tmp2*inv;
51             mu3 += tmp3*inv;
52             i += 1;
53         }
54         Real num1 = Real(elements  - (elements %4))/Real(4);
55         Real num2 = num1 + Real(elements % 4);
56 
57         for (auto it = end; it != last; ++it)
58         {
59             mu3 += (*it-mu3)/i;
60             i += 1;
61         }
62 
63         return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements);
64     }
65     else
66     {
67         auto it = first;
68         Real mu = *it;
69         Real i = 2;
70         while(++it != last)
71         {
72             mu += (*it - mu)/i;
73             i += 1;
74         }
75         return mu;
76     }
77 }
78 
79 template<class Container>
mean(Container const & v)80 inline auto mean(Container const & v)
81 {
82     return mean(v.cbegin(), v.cend());
83 }
84 
85 template<class ForwardIterator>
variance(ForwardIterator first,ForwardIterator last)86 auto variance(ForwardIterator first, ForwardIterator last)
87 {
88     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
89     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
90     // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
91     if constexpr (std::is_integral<Real>::value)
92     {
93         double M = *first;
94         double Q = 0;
95         double k = 2;
96         for (auto it = std::next(first); it != last; ++it)
97         {
98             double tmp = *it - M;
99             Q = Q + ((k-1)*tmp*tmp)/k;
100             M = M + tmp/k;
101             k += 1;
102         }
103         return Q/(k-1);
104     }
105     else
106     {
107         Real M = *first;
108         Real Q = 0;
109         Real k = 2;
110         for (auto it = std::next(first); it != last; ++it)
111         {
112             Real tmp = (*it - M)/k;
113             Q += k*(k-1)*tmp*tmp;
114             M += tmp;
115             k += 1;
116         }
117         return Q/(k-1);
118     }
119 }
120 
121 template<class Container>
variance(Container const & v)122 inline auto variance(Container const & v)
123 {
124     return variance(v.cbegin(), v.cend());
125 }
126 
127 template<class ForwardIterator>
sample_variance(ForwardIterator first,ForwardIterator last)128 auto sample_variance(ForwardIterator first, ForwardIterator last)
129 {
130     size_t n = std::distance(first, last);
131     BOOST_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance.");
132     return n*variance(first, last)/(n-1);
133 }
134 
135 template<class Container>
sample_variance(Container const & v)136 inline auto sample_variance(Container const & v)
137 {
138     return sample_variance(v.cbegin(), v.cend());
139 }
140 
141 template<class ForwardIterator>
mean_and_sample_variance(ForwardIterator first,ForwardIterator last)142 auto mean_and_sample_variance(ForwardIterator first, ForwardIterator last)
143 {
144     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
145     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
146     // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
147     if constexpr (std::is_integral<Real>::value)
148     {
149         double M = *first;
150         double Q = 0;
151         double k = 2;
152         for (auto it = std::next(first); it != last; ++it)
153         {
154             double tmp = *it - M;
155             Q = Q + ((k-1)*tmp*tmp)/k;
156             M = M + tmp/k;
157             k += 1;
158         }
159         return std::pair<double, double>{M, Q/(k-2)};
160     }
161     else
162     {
163         Real M = *first;
164         Real Q = 0;
165         Real k = 2;
166         for (auto it = std::next(first); it != last; ++it)
167         {
168             Real tmp = (*it - M)/k;
169             Q += k*(k-1)*tmp*tmp;
170             M += tmp;
171             k += 1;
172         }
173         return std::pair<Real, Real>{M, Q/(k-2)};
174     }
175 }
176 
177 template<class Container>
mean_and_sample_variance(Container const & v)178 auto mean_and_sample_variance(Container const & v)
179 {
180     return mean_and_sample_variance(v.begin(), v.end());
181 }
182 
183 // Follows equation 1.5 of:
184 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
185 template<class ForwardIterator>
skewness(ForwardIterator first,ForwardIterator last)186 auto skewness(ForwardIterator first, ForwardIterator last)
187 {
188     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
189     using std::sqrt;
190     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute skewness.");
191     if constexpr (std::is_integral<Real>::value)
192     {
193         double M1 = *first;
194         double M2 = 0;
195         double M3 = 0;
196         double n = 2;
197         for (auto it = std::next(first); it != last; ++it)
198         {
199             double delta21 = *it - M1;
200             double tmp = delta21/n;
201             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
202             M2 = M2 + tmp*(n-1)*delta21;
203             M1 = M1 + tmp;
204             n += 1;
205         }
206 
207         double var = M2/(n-1);
208         if (var == 0)
209         {
210             // The limit is technically undefined, but the interpretation here is clear:
211             // A constant dataset has no skewness.
212             return double(0);
213         }
214         double skew = M3/(M2*sqrt(var));
215         return skew;
216     }
217     else
218     {
219         Real M1 = *first;
220         Real M2 = 0;
221         Real M3 = 0;
222         Real n = 2;
223         for (auto it = std::next(first); it != last; ++it)
224         {
225             Real delta21 = *it - M1;
226             Real tmp = delta21/n;
227             M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
228             M2 += tmp*(n-1)*delta21;
229             M1 += tmp;
230             n += 1;
231         }
232 
233         Real var = M2/(n-1);
234         if (var == 0)
235         {
236             // The limit is technically undefined, but the interpretation here is clear:
237             // A constant dataset has no skewness.
238             return Real(0);
239         }
240         Real skew = M3/(M2*sqrt(var));
241         return skew;
242     }
243 }
244 
245 template<class Container>
skewness(Container const & v)246 inline auto skewness(Container const & v)
247 {
248     return skewness(v.cbegin(), v.cend());
249 }
250 
251 // Follows equation 1.5/1.6 of:
252 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
253 template<class ForwardIterator>
first_four_moments(ForwardIterator first,ForwardIterator last)254 auto first_four_moments(ForwardIterator first, ForwardIterator last)
255 {
256     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
257     BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments.");
258     if constexpr (std::is_integral<Real>::value)
259     {
260         double M1 = *first;
261         double M2 = 0;
262         double M3 = 0;
263         double M4 = 0;
264         double n = 2;
265         for (auto it = std::next(first); it != last; ++it)
266         {
267             double delta21 = *it - M1;
268             double tmp = delta21/n;
269             M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
270             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
271             M2 = M2 + tmp*(n-1)*delta21;
272             M1 = M1 + tmp;
273             n += 1;
274         }
275 
276         return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
277     }
278     else
279     {
280         Real M1 = *first;
281         Real M2 = 0;
282         Real M3 = 0;
283         Real M4 = 0;
284         Real n = 2;
285         for (auto it = std::next(first); it != last; ++it)
286         {
287             Real delta21 = *it - M1;
288             Real tmp = delta21/n;
289             M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
290             M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
291             M2 = M2 + tmp*(n-1)*delta21;
292             M1 = M1 + tmp;
293             n += 1;
294         }
295 
296         return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
297     }
298 }
299 
300 template<class Container>
first_four_moments(Container const & v)301 inline auto first_four_moments(Container const & v)
302 {
303     return first_four_moments(v.cbegin(), v.cend());
304 }
305 
306 
307 // Follows equation 1.6 of:
308 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
309 template<class ForwardIterator>
kurtosis(ForwardIterator first,ForwardIterator last)310 auto kurtosis(ForwardIterator first, ForwardIterator last)
311 {
312     auto [M1, M2, M3, M4] = first_four_moments(first, last);
313     if (M2 == 0)
314     {
315         return M2;
316     }
317     return M4/(M2*M2);
318 }
319 
320 template<class Container>
kurtosis(Container const & v)321 inline auto kurtosis(Container const & v)
322 {
323     return kurtosis(v.cbegin(), v.cend());
324 }
325 
326 template<class ForwardIterator>
excess_kurtosis(ForwardIterator first,ForwardIterator last)327 auto excess_kurtosis(ForwardIterator first, ForwardIterator last)
328 {
329     return kurtosis(first, last) - 3;
330 }
331 
332 template<class Container>
excess_kurtosis(Container const & v)333 inline auto excess_kurtosis(Container const & v)
334 {
335     return excess_kurtosis(v.cbegin(), v.cend());
336 }
337 
338 
339 template<class RandomAccessIterator>
median(RandomAccessIterator first,RandomAccessIterator last)340 auto median(RandomAccessIterator first, RandomAccessIterator last)
341 {
342     size_t num_elems = std::distance(first, last);
343     BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined.");
344     if (num_elems & 1)
345     {
346         auto middle = first + (num_elems - 1)/2;
347         std::nth_element(first, middle, last);
348         return *middle;
349     }
350     else
351     {
352         auto middle = first + num_elems/2 - 1;
353         std::nth_element(first, middle, last);
354         std::nth_element(middle, middle+1, last);
355         return (*middle + *(middle+1))/2;
356     }
357 }
358 
359 
360 template<class RandomAccessContainer>
median(RandomAccessContainer & v)361 inline auto median(RandomAccessContainer & v)
362 {
363     return median(v.begin(), v.end());
364 }
365 
366 template<class RandomAccessIterator>
gini_coefficient(RandomAccessIterator first,RandomAccessIterator last)367 auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
368 {
369     using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
370     BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples.");
371 
372     std::sort(first, last);
373     if constexpr (std::is_integral<Real>::value)
374     {
375         double i = 1;
376         double num = 0;
377         double denom = 0;
378         for (auto it = first; it != last; ++it)
379         {
380             num += *it*i;
381             denom += *it;
382             ++i;
383         }
384 
385         // If the l1 norm is zero, all elements are zero, so every element is the same.
386         if (denom == 0)
387         {
388             return double(0);
389         }
390 
391         return ((2*num)/denom - i)/(i-1);
392     }
393     else
394     {
395         Real i = 1;
396         Real num = 0;
397         Real denom = 0;
398         for (auto it = first; it != last; ++it)
399         {
400             num += *it*i;
401             denom += *it;
402             ++i;
403         }
404 
405         // If the l1 norm is zero, all elements are zero, so every element is the same.
406         if (denom == 0)
407         {
408             return Real(0);
409         }
410 
411         return ((2*num)/denom - i)/(i-1);
412     }
413 }
414 
415 template<class RandomAccessContainer>
gini_coefficient(RandomAccessContainer & v)416 inline auto gini_coefficient(RandomAccessContainer & v)
417 {
418     return gini_coefficient(v.begin(), v.end());
419 }
420 
421 template<class RandomAccessIterator>
sample_gini_coefficient(RandomAccessIterator first,RandomAccessIterator last)422 inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
423 {
424     size_t n = std::distance(first, last);
425     return n*gini_coefficient(first, last)/(n-1);
426 }
427 
428 template<class RandomAccessContainer>
sample_gini_coefficient(RandomAccessContainer & v)429 inline auto sample_gini_coefficient(RandomAccessContainer & v)
430 {
431     return sample_gini_coefficient(v.begin(), v.end());
432 }
433 
434 template<class RandomAccessIterator>
median_absolute_deviation(RandomAccessIterator first,RandomAccessIterator last,typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN ())435 auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN())
436 {
437     using std::abs;
438     using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
439     using std::isnan;
440     if (isnan(center))
441     {
442         center = boost::math::statistics::median(first, last);
443     }
444     size_t num_elems = std::distance(first, last);
445     BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined.");
446     auto comparator = [&center](Real a, Real b) { return abs(a-center) < abs(b-center);};
447     if (num_elems & 1)
448     {
449         auto middle = first + (num_elems - 1)/2;
450         std::nth_element(first, middle, last, comparator);
451         return abs(*middle);
452     }
453     else
454     {
455         auto middle = first + num_elems/2 - 1;
456         std::nth_element(first, middle, last, comparator);
457         std::nth_element(middle, middle+1, last, comparator);
458         return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2));
459     }
460 }
461 
462 template<class RandomAccessContainer>
median_absolute_deviation(RandomAccessContainer & v,typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN ())463 inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN())
464 {
465     return median_absolute_deviation(v.begin(), v.end(), center);
466 }
467 
468 template<class ForwardIterator>
interquartile_range(ForwardIterator first,ForwardIterator last)469 auto interquartile_range(ForwardIterator first, ForwardIterator last)
470 {
471     using Real = typename std::iterator_traits<ForwardIterator>::value_type;
472     static_assert(!std::is_integral<Real>::value, "Integer values have not yet been implemented.");
473     auto m = std::distance(first,last);
474     BOOST_ASSERT_MSG(m >= 3, "At least 3 samples are required to compute the interquartile range.");
475     auto k = m/4;
476     auto j = m - (4*k);
477     // m = 4k+j.
478     // If j = 0 or j = 1, then there are an even number of samples below the median, and an even number above the median.
479     //    Then we must average adjacent elements to get the quartiles.
480     // If j = 2 or j = 3, there are an odd number of samples above and below the median, these elements may be directly extracted to get the quartiles.
481 
482     if (j==2 || j==3)
483     {
484         auto q1 = first + k;
485         auto q3 = first + 3*k + j - 1;
486         std::nth_element(first, q1, last);
487         Real Q1 = *q1;
488         std::nth_element(q1, q3, last);
489         Real Q3 = *q3;
490         return Q3 - Q1;
491     } else {
492         // j == 0 or j==1:
493         auto q1 = first + k - 1;
494         auto q3 = first + 3*k - 1 + j;
495         std::nth_element(first, q1, last);
496         Real a = *q1;
497         std::nth_element(q1, q1 + 1, last);
498         Real b = *(q1 + 1);
499         Real Q1 = (a+b)/2;
500         std::nth_element(q1, q3, last);
501         a = *q3;
502         std::nth_element(q3, q3 + 1, last);
503         b = *(q3 + 1);
504         Real Q3 = (a+b)/2;
505         return Q3 - Q1;
506     }
507 }
508 
509 template<class RandomAccessContainer>
interquartile_range(RandomAccessContainer & v)510 inline auto interquartile_range(RandomAccessContainer & v)
511 {
512     return interquartile_range(v.begin(), v.end());
513 }
514 
515 
516 }
517 #endif
518