1""" 2Basic statistics module. 3 4This module provides functions for calculating statistics of data, including 5averages, variance, and standard deviation. 6 7Calculating averages 8-------------------- 9 10================== ================================================== 11Function Description 12================== ================================================== 13mean Arithmetic mean (average) of data. 14fmean Fast, floating point arithmetic mean. 15geometric_mean Geometric mean of data. 16harmonic_mean Harmonic mean of data. 17median Median (middle value) of data. 18median_low Low median of data. 19median_high High median of data. 20median_grouped Median, or 50th percentile, of grouped data. 21mode Mode (most common value) of data. 22multimode List of modes (most common values of data). 23quantiles Divide data into intervals with equal probability. 24================== ================================================== 25 26Calculate the arithmetic mean ("the average") of data: 27 28>>> mean([-1.0, 2.5, 3.25, 5.75]) 292.625 30 31 32Calculate the standard median of discrete data: 33 34>>> median([2, 3, 4, 5]) 353.5 36 37 38Calculate the median, or 50th percentile, of data grouped into class intervals 39centred on the data values provided. E.g. if your data points are rounded to 40the nearest whole number: 41 42>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS 432.8333333333... 44 45This should be interpreted in this way: you have two data points in the class 46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in 47the class interval 3.5-4.5. The median of these data points is 2.8333... 48 49 50Calculating variability or spread 51--------------------------------- 52 53================== ============================================= 54Function Description 55================== ============================================= 56pvariance Population variance of data. 57variance Sample variance of data. 58pstdev Population standard deviation of data. 59stdev Sample standard deviation of data. 60================== ============================================= 61 62Calculate the standard deviation of sample data: 63 64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS 654.38961843444... 66 67If you have previously calculated the mean, you can pass it as the optional 68second argument to the four "spread" functions to avoid recalculating it: 69 70>>> data = [1, 2, 2, 4, 4, 4, 5, 6] 71>>> mu = mean(data) 72>>> pvariance(data, mu) 732.5 74 75 76Exceptions 77---------- 78 79A single exception is defined: StatisticsError is a subclass of ValueError. 
80 81""" 82 83__all__ = [ 84 'NormalDist', 85 'StatisticsError', 86 'fmean', 87 'geometric_mean', 88 'harmonic_mean', 89 'mean', 90 'median', 91 'median_grouped', 92 'median_high', 93 'median_low', 94 'mode', 95 'multimode', 96 'pstdev', 97 'pvariance', 98 'quantiles', 99 'stdev', 100 'variance', 101] 102 103import math 104import numbers 105import random 106 107from fractions import Fraction 108from decimal import Decimal 109from itertools import groupby 110from bisect import bisect_left, bisect_right 111from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum 112from operator import itemgetter 113from collections import Counter 114 115# === Exceptions === 116 117class StatisticsError(ValueError): 118 pass 119 120 121# === Private utilities === 122 123def _sum(data, start=0): 124 """_sum(data [, start]) -> (type, sum, count) 125 126 Return a high-precision sum of the given numeric data as a fraction, 127 together with the type to be converted to and the count of items. 128 129 If optional argument ``start`` is given, it is added to the total. 130 If ``data`` is empty, ``start`` (defaulting to 0) is returned. 131 132 133 Examples 134 -------- 135 136 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) 137 (<class 'float'>, Fraction(11, 1), 5) 138 139 Some sources of round-off error will be avoided: 140 141 # Built-in sum returns zero. 142 >>> _sum([1e50, 1, -1e50] * 1000) 143 (<class 'float'>, Fraction(1000, 1), 3000) 144 145 Fractions and Decimals are also supported: 146 147 >>> from fractions import Fraction as F 148 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) 149 (<class 'fractions.Fraction'>, Fraction(63, 20), 4) 150 151 >>> from decimal import Decimal as D 152 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] 153 >>> _sum(data) 154 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4) 155 156 Mixed types are currently treated as an error, except that int is 157 allowed. 158 """ 159 count = 0 160 n, d = _exact_ratio(start) 161 partials = {d: n} 162 partials_get = partials.get 163 T = _coerce(int, type(start)) 164 for typ, values in groupby(data, type): 165 T = _coerce(T, typ) # or raise TypeError 166 for n,d in map(_exact_ratio, values): 167 count += 1 168 partials[d] = partials_get(d, 0) + n 169 if None in partials: 170 # The sum will be a NAN or INF. We can ignore all the finite 171 # partials, and just look at this special one. 172 total = partials[None] 173 assert not _isfinite(total) 174 else: 175 # Sum all the partial sums using builtin sum. 176 # FIXME is this faster if we sum them in order of the denominator? 177 total = sum(Fraction(n, d) for d, n in sorted(partials.items())) 178 return (T, total, count) 179 180 181def _isfinite(x): 182 try: 183 return x.is_finite() # Likely a Decimal. 184 except AttributeError: 185 return math.isfinite(x) # Coerces to float first. 186 187 188def _coerce(T, S): 189 """Coerce types T and S to a common type, or raise TypeError. 190 191 Coercion rules are currently an implementation detail. See the CoerceTest 192 test class in test_statistics for details. 193 """ 194 # See http://bugs.python.org/issue24068. 195 assert T is not bool, "initial type T is bool" 196 # If the types are the same, no need to coerce anything. Put this 197 # first, so that the usual case (no coercion needed) happens as soon 198 # as possible. 199 if T is S: return T 200 # Mixed int & other coerce to the other type. 201 if S is int or S is bool: return T 202 if T is int: return S 203 # If one is a (strict) subclass of the other, coerce to the subclass. 
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T): return S
    if issubclass(T, S): return T
    # Ints coerce to the other type.
    if issubclass(T, int): return S
    if issubclass(S, int): return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))


def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a) + 1) and a[i - 1] == x:
        return i - 1
    raise ValueError


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total/n, T)


def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(iterable):
            nonlocal n
            for n, x in enumerate(iterable, start=1):
                yield x
        total = fsum(count(data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None


def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None


def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds.  Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earnings) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n/total, T)


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]


def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped.  In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc.  The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1.  Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval * (n/2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value.  This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes with the same frequency, the first one
    encountered is returned:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.
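
    Any iterable of hashable values is accepted; a string, for instance,
    is treated as an iterable of characters (an illustrative example):

    >>> mode('aabbbc')
    'b'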
555 556 """ 557 data = iter(data) 558 pairs = Counter(data).most_common(1) 559 try: 560 return pairs[0][0] 561 except IndexError: 562 raise StatisticsError('no mode for empty data') from None 563 564 565def multimode(data): 566 """Return a list of the most frequently occurring values. 567 568 Will return more than one result if there are multiple modes 569 or an empty list if *data* is empty. 570 571 >>> multimode('aabbbbbbbbcc') 572 ['b'] 573 >>> multimode('aabbbbccddddeeffffgg') 574 ['b', 'd', 'f'] 575 >>> multimode('') 576 [] 577 """ 578 counts = Counter(iter(data)).most_common() 579 maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, [])) 580 return list(map(itemgetter(0), mode_items)) 581 582 583# Notes on methods for computing quantiles 584# ---------------------------------------- 585# 586# There is no one perfect way to compute quantiles. Here we offer 587# two methods that serve common needs. Most other packages 588# surveyed offered at least one or both of these two, making them 589# "standard" in the sense of "widely-adopted and reproducible". 590# They are also easy to explain, easy to compute manually, and have 591# straight-forward interpretations that aren't surprising. 592 593# The default method is known as "R6", "PERCENTILE.EXC", or "expected 594# value of rank order statistics". The alternative method is known as 595# "R7", "PERCENTILE.INC", or "mode of rank order statistics". 596 597# For sample data where there is a positive probability for values 598# beyond the range of the data, the R6 exclusive method is a 599# reasonable choice. Consider a random sample of nine values from a 600# population with a uniform distribution from 0.0 to 100.0. The 601# distribution of the third ranked sample point is described by 602# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and 603# mean=0.300. Only the latter (which corresponds with R6) gives the 604# desired cut point with 30% of the population falling below that 605# value, making it comparable to a result from an inv_cdf() function. 606# The R6 exclusive method is also idempotent. 607 608# For describing population data where the end points are known to 609# be included in the data, the R7 inclusive method is a reasonable 610# choice. Instead of the mean, it uses the mode of the beta 611# distribution for the interior points. Per Hyndman & Fan, "One nice 612# property is that the vertices of Q7(p) divide the range into n - 1 613# intervals, and exactly 100p% of the intervals lie to the left of 614# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)." 615 616# If needed, other methods could be added. However, for now, the 617# position is that fewer options make for easier choices and that 618# external packages can be used for anything more advanced. 619 620def quantiles(data, *, n=4, method='exclusive'): 621 """Divide *data* into *n* continuous intervals with equal probability. 622 623 Returns a list of (n - 1) cut points separating the intervals. 624 625 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. 626 Set *n* to 100 for percentiles which gives the 99 cuts points that 627 separate *data* in to 100 equal sized groups. 628 629 The *data* can be any iterable containing sample. 630 The cut points are linearly interpolated between data points. 631 632 If *method* is set to *inclusive*, *data* is treated as population 633 data. The minimum value is treated as the 0th percentile and the 634 maximum value is treated as the 100th percentile. 
635 """ 636 if n < 1: 637 raise StatisticsError('n must be at least 1') 638 data = sorted(data) 639 ld = len(data) 640 if ld < 2: 641 raise StatisticsError('must have at least two data points') 642 if method == 'inclusive': 643 m = ld - 1 644 result = [] 645 for i in range(1, n): 646 j = i * m // n 647 delta = i*m - j*n 648 interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n 649 result.append(interpolated) 650 return result 651 if method == 'exclusive': 652 m = ld + 1 653 result = [] 654 for i in range(1, n): 655 j = i * m // n # rescale i to m/n 656 j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1 657 delta = i*m - j*n # exact integer math 658 interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n 659 result.append(interpolated) 660 return result 661 raise ValueError(f'Unknown method: {method!r}') 662 663 664# === Measures of spread === 665 666# See http://mathworld.wolfram.com/Variance.html 667# http://mathworld.wolfram.com/SampleVariance.html 668# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 669# 670# Under no circumstances use the so-called "computational formula for 671# variance", as that is only suitable for hand calculations with a small 672# amount of low-precision data. It has terrible numeric properties. 673# 674# See a comparison of three computational methods here: 675# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ 676 677def _ss(data, c=None): 678 """Return sum of square deviations of sequence data. 679 680 If ``c`` is None, the mean is calculated in one pass, and the deviations 681 from the mean are calculated in a second pass. Otherwise, deviations are 682 calculated from ``c`` as given. Use the second case with care, as it can 683 lead to garbage results. 684 """ 685 if c is None: 686 c = mean(data) 687 T, total, count = _sum((x-c)**2 for x in data) 688 # The following sum should mathematically equal zero, but due to rounding 689 # error may not. 690 U, total2, count2 = _sum((x-c) for x in data) 691 assert T == U and count == count2 692 total -= total2**2/len(data) 693 assert not total < 0, 'negative sum of square deviations: %f' % total 694 return (T, total) 695 696 697def variance(data, xbar=None): 698 """Return the sample variance of data. 699 700 data should be an iterable of Real-valued numbers, with at least two 701 values. The optional argument xbar, if given, should be the mean of 702 the data. If it is missing or None, the mean is automatically calculated. 703 704 Use this function when your data is a sample from a population. To 705 calculate the variance from the entire population, see ``pvariance``. 706 707 Examples: 708 709 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5] 710 >>> variance(data) 711 1.3720238095238095 712 713 If you have already calculated the mean of your data, you can pass it as 714 the optional second argument ``xbar`` to avoid recalculating it: 715 716 >>> m = mean(data) 717 >>> variance(data, m) 718 1.3720238095238095 719 720 This function does not check that ``xbar`` is actually the mean of 721 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or 722 impossible results. 

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss / (n - 1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at
    least one value.  The optional argument mu, if given, should be the mean
    of the data.  If it is missing or None, the mean is automatically
    calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
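    # Structure of the approximation (a descriptive note): one rational
    # polynomial covers the central region |p - 0.5| <= 0.425; otherwise
    # r = sqrt(-log(min(p, 1 - p))) selects between two further rational
    # polynomials for the tails, split at r = 5.0.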
    q = p - 0.5
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        x = -x
    return mu + (x * sigma)


class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for i in range(n)]

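    # Note: math.tau is 2*pi, so the sqrt(tau * variance) term in pdf()
    # below is the usual Gaussian normalizing constant sigma * sqrt(2*pi).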
    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
        variance = self._sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)

    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is at or below zero')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def quantiles(self, n=4):
        """Divide into *n* continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
        Set *n* to 100 for percentiles, which gives the 99 cut points that
        separate the normal distribution into 100 equal-sized groups.
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        >>> N1 = NormalDist(2.4, 1.6)
        >>> N2 = NormalDist(3.2, 2.0)
        >>> N1.overlap(N2)
        0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        return self._mu

    @property
    def median(self):
        "Return the median of the normal distribution."
        return self._mu

    @property
    def mode(self):
        """Return the mode of the normal distribution.

        The mode is the value x where the probability density function
        (pdf) takes its maximum value.
        """
        return self._mu

    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma

    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma ** 2.0

    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu + x2, x1._sigma)

    def __sub__(x1, x2):
        """Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu - x2, x1._sigma)

    def __mul__(x1, x2):
        """Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))

    def __truediv__(x1, x2):
        """Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))

    def __pos__(x1):
        "Return a copy of the instance."
        return NormalDist(x1._mu, x1._sigma)

    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        "Subtract a NormalDist from a constant or another NormalDist."
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        "Two NormalDist objects are equal if their mu and sigma are both equal."
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return x1._mu == x2._mu and x1._sigma == x2._sigma
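    # Defining __eq__ above implicitly sets __hash__ to None, so it is
    # restored explicitly below; equal instances must hash equal.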
    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'


# If available, use C implementation
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass


if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
    import doctest

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mean == g1.mean
    assert (g1 * 5 / 5).stdev == g1.stdev

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        assert isclose(G1.mean, G2.mean, rel_tol=0.01), (G1, G2)
        assert isclose(G1.stdev, G2.stdev, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)

    print(doctest.testmod())