1 // (C) Copyright Nick Thompson 2018.
2 // Use, modification and distribution are subject to the
3 // Boost Software License, Version 1.0. (See accompanying file
4 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
6 #ifndef BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP
7 #define BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP
8
9 #include <algorithm>
10 #include <iterator>
11 #include <tuple>
12 #include <cmath>
13 #include <boost/assert.hpp>
14
15 namespace boost::math::statistics {
16
17 template<class ForwardIterator>
mean(ForwardIterator first,ForwardIterator last)18 auto mean(ForwardIterator first, ForwardIterator last)
19 {
20 using Real = typename std::iterator_traits<ForwardIterator>::value_type;
21 BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean.");
22 if constexpr (std::is_integral<Real>::value)
23 {
24 double mu = 0;
25 double i = 1;
26 for(auto it = first; it != last; ++it) {
27 mu = mu + (*it - mu)/i;
28 i += 1;
29 }
30 return mu;
31 }
32 else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>)
33 {
34 size_t elements = std::distance(first, last);
35 Real mu0 = 0;
36 Real mu1 = 0;
37 Real mu2 = 0;
38 Real mu3 = 0;
39 Real i = 1;
40 auto end = last - (elements % 4);
41 for(auto it = first; it != end; it += 4) {
42 Real inv = Real(1)/i;
43 Real tmp0 = (*it - mu0);
44 Real tmp1 = (*(it+1) - mu1);
45 Real tmp2 = (*(it+2) - mu2);
46 Real tmp3 = (*(it+3) - mu3);
47 // please generate a vectorized fma here
48 mu0 += tmp0*inv;
49 mu1 += tmp1*inv;
50 mu2 += tmp2*inv;
51 mu3 += tmp3*inv;
52 i += 1;
53 }
54 Real num1 = Real(elements - (elements %4))/Real(4);
55 Real num2 = num1 + Real(elements % 4);
56
57 for (auto it = end; it != last; ++it)
58 {
59 mu3 += (*it-mu3)/i;
60 i += 1;
61 }
62
63 return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements);
64 }
65 else
66 {
67 auto it = first;
68 Real mu = *it;
69 Real i = 2;
70 while(++it != last)
71 {
72 mu += (*it - mu)/i;
73 i += 1;
74 }
75 return mu;
76 }
77 }
78
79 template<class Container>
mean(Container const & v)80 inline auto mean(Container const & v)
81 {
82 return mean(v.cbegin(), v.cend());
83 }
84
85 template<class ForwardIterator>
variance(ForwardIterator first,ForwardIterator last)86 auto variance(ForwardIterator first, ForwardIterator last)
87 {
88 using Real = typename std::iterator_traits<ForwardIterator>::value_type;
89 BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
90 // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
91 if constexpr (std::is_integral<Real>::value)
92 {
93 double M = *first;
94 double Q = 0;
95 double k = 2;
96 for (auto it = std::next(first); it != last; ++it)
97 {
98 double tmp = *it - M;
99 Q = Q + ((k-1)*tmp*tmp)/k;
100 M = M + tmp/k;
101 k += 1;
102 }
103 return Q/(k-1);
104 }
105 else
106 {
107 Real M = *first;
108 Real Q = 0;
109 Real k = 2;
110 for (auto it = std::next(first); it != last; ++it)
111 {
112 Real tmp = (*it - M)/k;
113 Q += k*(k-1)*tmp*tmp;
114 M += tmp;
115 k += 1;
116 }
117 return Q/(k-1);
118 }
119 }
120
121 template<class Container>
variance(Container const & v)122 inline auto variance(Container const & v)
123 {
124 return variance(v.cbegin(), v.cend());
125 }
126
127 template<class ForwardIterator>
sample_variance(ForwardIterator first,ForwardIterator last)128 auto sample_variance(ForwardIterator first, ForwardIterator last)
129 {
130 size_t n = std::distance(first, last);
131 BOOST_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance.");
132 return n*variance(first, last)/(n-1);
133 }
134
135 template<class Container>
sample_variance(Container const & v)136 inline auto sample_variance(Container const & v)
137 {
138 return sample_variance(v.cbegin(), v.cend());
139 }
140
141 template<class ForwardIterator>
mean_and_sample_variance(ForwardIterator first,ForwardIterator last)142 auto mean_and_sample_variance(ForwardIterator first, ForwardIterator last)
143 {
144 using Real = typename std::iterator_traits<ForwardIterator>::value_type;
145 BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
146 // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
147 if constexpr (std::is_integral<Real>::value)
148 {
149 double M = *first;
150 double Q = 0;
151 double k = 2;
152 for (auto it = std::next(first); it != last; ++it)
153 {
154 double tmp = *it - M;
155 Q = Q + ((k-1)*tmp*tmp)/k;
156 M = M + tmp/k;
157 k += 1;
158 }
159 return std::pair<double, double>{M, Q/(k-2)};
160 }
161 else
162 {
163 Real M = *first;
164 Real Q = 0;
165 Real k = 2;
166 for (auto it = std::next(first); it != last; ++it)
167 {
168 Real tmp = (*it - M)/k;
169 Q += k*(k-1)*tmp*tmp;
170 M += tmp;
171 k += 1;
172 }
173 return std::pair<Real, Real>{M, Q/(k-2)};
174 }
175 }
176
177 template<class Container>
mean_and_sample_variance(Container const & v)178 auto mean_and_sample_variance(Container const & v)
179 {
180 return mean_and_sample_variance(v.begin(), v.end());
181 }
182
183 // Follows equation 1.5 of:
184 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
185 template<class ForwardIterator>
skewness(ForwardIterator first,ForwardIterator last)186 auto skewness(ForwardIterator first, ForwardIterator last)
187 {
188 using Real = typename std::iterator_traits<ForwardIterator>::value_type;
189 using std::sqrt;
190 BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute skewness.");
191 if constexpr (std::is_integral<Real>::value)
192 {
193 double M1 = *first;
194 double M2 = 0;
195 double M3 = 0;
196 double n = 2;
197 for (auto it = std::next(first); it != last; ++it)
198 {
199 double delta21 = *it - M1;
200 double tmp = delta21/n;
201 M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
202 M2 = M2 + tmp*(n-1)*delta21;
203 M1 = M1 + tmp;
204 n += 1;
205 }
206
207 double var = M2/(n-1);
208 if (var == 0)
209 {
210 // The limit is technically undefined, but the interpretation here is clear:
211 // A constant dataset has no skewness.
212 return double(0);
213 }
214 double skew = M3/(M2*sqrt(var));
215 return skew;
216 }
217 else
218 {
219 Real M1 = *first;
220 Real M2 = 0;
221 Real M3 = 0;
222 Real n = 2;
223 for (auto it = std::next(first); it != last; ++it)
224 {
225 Real delta21 = *it - M1;
226 Real tmp = delta21/n;
227 M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
228 M2 += tmp*(n-1)*delta21;
229 M1 += tmp;
230 n += 1;
231 }
232
233 Real var = M2/(n-1);
234 if (var == 0)
235 {
236 // The limit is technically undefined, but the interpretation here is clear:
237 // A constant dataset has no skewness.
238 return Real(0);
239 }
240 Real skew = M3/(M2*sqrt(var));
241 return skew;
242 }
243 }
244
245 template<class Container>
skewness(Container const & v)246 inline auto skewness(Container const & v)
247 {
248 return skewness(v.cbegin(), v.cend());
249 }
250
251 // Follows equation 1.5/1.6 of:
252 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
253 template<class ForwardIterator>
first_four_moments(ForwardIterator first,ForwardIterator last)254 auto first_four_moments(ForwardIterator first, ForwardIterator last)
255 {
256 using Real = typename std::iterator_traits<ForwardIterator>::value_type;
257 BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments.");
258 if constexpr (std::is_integral<Real>::value)
259 {
260 double M1 = *first;
261 double M2 = 0;
262 double M3 = 0;
263 double M4 = 0;
264 double n = 2;
265 for (auto it = std::next(first); it != last; ++it)
266 {
267 double delta21 = *it - M1;
268 double tmp = delta21/n;
269 M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
270 M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
271 M2 = M2 + tmp*(n-1)*delta21;
272 M1 = M1 + tmp;
273 n += 1;
274 }
275
276 return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
277 }
278 else
279 {
280 Real M1 = *first;
281 Real M2 = 0;
282 Real M3 = 0;
283 Real M4 = 0;
284 Real n = 2;
285 for (auto it = std::next(first); it != last; ++it)
286 {
287 Real delta21 = *it - M1;
288 Real tmp = delta21/n;
289 M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
290 M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
291 M2 = M2 + tmp*(n-1)*delta21;
292 M1 = M1 + tmp;
293 n += 1;
294 }
295
296 return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
297 }
298 }
299
300 template<class Container>
first_four_moments(Container const & v)301 inline auto first_four_moments(Container const & v)
302 {
303 return first_four_moments(v.cbegin(), v.cend());
304 }
305
306
307 // Follows equation 1.6 of:
308 // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
309 template<class ForwardIterator>
kurtosis(ForwardIterator first,ForwardIterator last)310 auto kurtosis(ForwardIterator first, ForwardIterator last)
311 {
312 auto [M1, M2, M3, M4] = first_four_moments(first, last);
313 if (M2 == 0)
314 {
315 return M2;
316 }
317 return M4/(M2*M2);
318 }
319
320 template<class Container>
kurtosis(Container const & v)321 inline auto kurtosis(Container const & v)
322 {
323 return kurtosis(v.cbegin(), v.cend());
324 }
325
326 template<class ForwardIterator>
excess_kurtosis(ForwardIterator first,ForwardIterator last)327 auto excess_kurtosis(ForwardIterator first, ForwardIterator last)
328 {
329 return kurtosis(first, last) - 3;
330 }
331
332 template<class Container>
excess_kurtosis(Container const & v)333 inline auto excess_kurtosis(Container const & v)
334 {
335 return excess_kurtosis(v.cbegin(), v.cend());
336 }
337
338
339 template<class RandomAccessIterator>
median(RandomAccessIterator first,RandomAccessIterator last)340 auto median(RandomAccessIterator first, RandomAccessIterator last)
341 {
342 size_t num_elems = std::distance(first, last);
343 BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined.");
344 if (num_elems & 1)
345 {
346 auto middle = first + (num_elems - 1)/2;
347 std::nth_element(first, middle, last);
348 return *middle;
349 }
350 else
351 {
352 auto middle = first + num_elems/2 - 1;
353 std::nth_element(first, middle, last);
354 std::nth_element(middle, middle+1, last);
355 return (*middle + *(middle+1))/2;
356 }
357 }
358
359
360 template<class RandomAccessContainer>
median(RandomAccessContainer & v)361 inline auto median(RandomAccessContainer & v)
362 {
363 return median(v.begin(), v.end());
364 }
365
366 template<class RandomAccessIterator>
gini_coefficient(RandomAccessIterator first,RandomAccessIterator last)367 auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
368 {
369 using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
370 BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples.");
371
372 std::sort(first, last);
373 if constexpr (std::is_integral<Real>::value)
374 {
375 double i = 1;
376 double num = 0;
377 double denom = 0;
378 for (auto it = first; it != last; ++it)
379 {
380 num += *it*i;
381 denom += *it;
382 ++i;
383 }
384
385 // If the l1 norm is zero, all elements are zero, so every element is the same.
386 if (denom == 0)
387 {
388 return double(0);
389 }
390
391 return ((2*num)/denom - i)/(i-1);
392 }
393 else
394 {
395 Real i = 1;
396 Real num = 0;
397 Real denom = 0;
398 for (auto it = first; it != last; ++it)
399 {
400 num += *it*i;
401 denom += *it;
402 ++i;
403 }
404
405 // If the l1 norm is zero, all elements are zero, so every element is the same.
406 if (denom == 0)
407 {
408 return Real(0);
409 }
410
411 return ((2*num)/denom - i)/(i-1);
412 }
413 }
414
415 template<class RandomAccessContainer>
gini_coefficient(RandomAccessContainer & v)416 inline auto gini_coefficient(RandomAccessContainer & v)
417 {
418 return gini_coefficient(v.begin(), v.end());
419 }
420
421 template<class RandomAccessIterator>
sample_gini_coefficient(RandomAccessIterator first,RandomAccessIterator last)422 inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
423 {
424 size_t n = std::distance(first, last);
425 return n*gini_coefficient(first, last)/(n-1);
426 }
427
428 template<class RandomAccessContainer>
sample_gini_coefficient(RandomAccessContainer & v)429 inline auto sample_gini_coefficient(RandomAccessContainer & v)
430 {
431 return sample_gini_coefficient(v.begin(), v.end());
432 }
433
434 template<class RandomAccessIterator>
median_absolute_deviation(RandomAccessIterator first,RandomAccessIterator last,typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN ())435 auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN())
436 {
437 using std::abs;
438 using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
439 using std::isnan;
440 if (isnan(center))
441 {
442 center = boost::math::statistics::median(first, last);
443 }
444 size_t num_elems = std::distance(first, last);
445 BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined.");
446 auto comparator = [¢er](Real a, Real b) { return abs(a-center) < abs(b-center);};
447 if (num_elems & 1)
448 {
449 auto middle = first + (num_elems - 1)/2;
450 std::nth_element(first, middle, last, comparator);
451 return abs(*middle);
452 }
453 else
454 {
455 auto middle = first + num_elems/2 - 1;
456 std::nth_element(first, middle, last, comparator);
457 std::nth_element(middle, middle+1, last, comparator);
458 return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2));
459 }
460 }
461
462 template<class RandomAccessContainer>
median_absolute_deviation(RandomAccessContainer & v,typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN ())463 inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN())
464 {
465 return median_absolute_deviation(v.begin(), v.end(), center);
466 }
467
468 template<class ForwardIterator>
interquartile_range(ForwardIterator first,ForwardIterator last)469 auto interquartile_range(ForwardIterator first, ForwardIterator last)
470 {
471 using Real = typename std::iterator_traits<ForwardIterator>::value_type;
472 static_assert(!std::is_integral<Real>::value, "Integer values have not yet been implemented.");
473 auto m = std::distance(first,last);
474 BOOST_ASSERT_MSG(m >= 3, "At least 3 samples are required to compute the interquartile range.");
475 auto k = m/4;
476 auto j = m - (4*k);
477 // m = 4k+j.
478 // If j = 0 or j = 1, then there are an even number of samples below the median, and an even number above the median.
479 // Then we must average adjacent elements to get the quartiles.
480 // If j = 2 or j = 3, there are an odd number of samples above and below the median, these elements may be directly extracted to get the quartiles.
481
482 if (j==2 || j==3)
483 {
484 auto q1 = first + k;
485 auto q3 = first + 3*k + j - 1;
486 std::nth_element(first, q1, last);
487 Real Q1 = *q1;
488 std::nth_element(q1, q3, last);
489 Real Q3 = *q3;
490 return Q3 - Q1;
491 } else {
492 // j == 0 or j==1:
493 auto q1 = first + k - 1;
494 auto q3 = first + 3*k - 1 + j;
495 std::nth_element(first, q1, last);
496 Real a = *q1;
497 std::nth_element(q1, q1 + 1, last);
498 Real b = *(q1 + 1);
499 Real Q1 = (a+b)/2;
500 std::nth_element(q1, q3, last);
501 a = *q3;
502 std::nth_element(q3, q3 + 1, last);
503 b = *(q3 + 1);
504 Real Q3 = (a+b)/2;
505 return Q3 - Q1;
506 }
507 }
508
509 template<class RandomAccessContainer>
interquartile_range(RandomAccessContainer & v)510 inline auto interquartile_range(RandomAccessContainer & v)
511 {
512 return interquartile_range(v.begin(), v.end());
513 }
514
515
516 }
517 #endif
518