1""" 2Basic statistics module. 3 4This module provides functions for calculating statistics of data, including 5averages, variance, and standard deviation. 6 7Calculating averages 8-------------------- 9 10================== ================================================== 11Function Description 12================== ================================================== 13mean Arithmetic mean (average) of data. 14fmean Fast, floating point arithmetic mean. 15geometric_mean Geometric mean of data. 16harmonic_mean Harmonic mean of data. 17median Median (middle value) of data. 18median_low Low median of data. 19median_high High median of data. 20median_grouped Median, or 50th percentile, of grouped data. 21mode Mode (most common value) of data. 22multimode List of modes (most common values of data). 23quantiles Divide data into intervals with equal probability. 24================== ================================================== 25 26Calculate the arithmetic mean ("the average") of data: 27 28>>> mean([-1.0, 2.5, 3.25, 5.75]) 292.625 30 31 32Calculate the standard median of discrete data: 33 34>>> median([2, 3, 4, 5]) 353.5 36 37 38Calculate the median, or 50th percentile, of data grouped into class intervals 39centred on the data values provided. E.g. if your data points are rounded to 40the nearest whole number: 41 42>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS 432.8333333333... 44 45This should be interpreted in this way: you have two data points in the class 46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in 47the class interval 3.5-4.5. The median of these data points is 2.8333... 48 49 50Calculating variability or spread 51--------------------------------- 52 53================== ============================================= 54Function Description 55================== ============================================= 56pvariance Population variance of data. 57variance Sample variance of data. 58pstdev Population standard deviation of data. 59stdev Sample standard deviation of data. 60================== ============================================= 61 62Calculate the standard deviation of sample data: 63 64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS 654.38961843444... 66 67If you have previously calculated the mean, you can pass it as the optional 68second argument to the four "spread" functions to avoid recalculating it: 69 70>>> data = [1, 2, 2, 4, 4, 4, 5, 6] 71>>> mu = mean(data) 72>>> pvariance(data, mu) 732.5 74 75 76Exceptions 77---------- 78 79A single exception is defined: StatisticsError is a subclass of ValueError. 
80 81""" 82 83__all__ = [ 84 'NormalDist', 85 'StatisticsError', 86 'fmean', 87 'geometric_mean', 88 'harmonic_mean', 89 'mean', 90 'median', 91 'median_grouped', 92 'median_high', 93 'median_low', 94 'mode', 95 'multimode', 96 'pstdev', 97 'pvariance', 98 'quantiles', 99 'stdev', 100 'variance', 101] 102 103import math 104import numbers 105import random 106 107from fractions import Fraction 108from decimal import Decimal 109from itertools import groupby 110from bisect import bisect_left, bisect_right 111from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum 112from operator import itemgetter 113from collections import Counter 114 115# === Exceptions === 116 117class StatisticsError(ValueError): 118 pass 119 120 121# === Private utilities === 122 123def _sum(data, start=0): 124 """_sum(data [, start]) -> (type, sum, count) 125 126 Return a high-precision sum of the given numeric data as a fraction, 127 together with the type to be converted to and the count of items. 128 129 If optional argument ``start`` is given, it is added to the total. 130 If ``data`` is empty, ``start`` (defaulting to 0) is returned. 131 132 133 Examples 134 -------- 135 136 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) 137 (<class 'float'>, Fraction(11, 1), 5) 138 139 Some sources of round-off error will be avoided: 140 141 # Built-in sum returns zero. 142 >>> _sum([1e50, 1, -1e50] * 1000) 143 (<class 'float'>, Fraction(1000, 1), 3000) 144 145 Fractions and Decimals are also supported: 146 147 >>> from fractions import Fraction as F 148 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) 149 (<class 'fractions.Fraction'>, Fraction(63, 20), 4) 150 151 >>> from decimal import Decimal as D 152 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] 153 >>> _sum(data) 154 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4) 155 156 Mixed types are currently treated as an error, except that int is 157 allowed. 158 """ 159 count = 0 160 n, d = _exact_ratio(start) 161 partials = {d: n} 162 partials_get = partials.get 163 T = _coerce(int, type(start)) 164 for typ, values in groupby(data, type): 165 T = _coerce(T, typ) # or raise TypeError 166 for n, d in map(_exact_ratio, values): 167 count += 1 168 partials[d] = partials_get(d, 0) + n 169 if None in partials: 170 # The sum will be a NAN or INF. We can ignore all the finite 171 # partials, and just look at this special one. 172 total = partials[None] 173 assert not _isfinite(total) 174 else: 175 # Sum all the partial sums using builtin sum. 176 # FIXME is this faster if we sum them in order of the denominator? 177 total = sum(Fraction(n, d) for d, n in sorted(partials.items())) 178 return (T, total, count) 179 180 181def _isfinite(x): 182 try: 183 return x.is_finite() # Likely a Decimal. 184 except AttributeError: 185 return math.isfinite(x) # Coerces to float first. 186 187 188def _coerce(T, S): 189 """Coerce types T and S to a common type, or raise TypeError. 190 191 Coercion rules are currently an implementation detail. See the CoerceTest 192 test class in test_statistics for details. 193 """ 194 # See http://bugs.python.org/issue24068. 195 assert T is not bool, "initial type T is bool" 196 # If the types are the same, no need to coerce anything. Put this 197 # first, so that the usual case (no coercion needed) happens as soon 198 # as possible. 199 if T is S: return T 200 # Mixed int & other coerce to the other type. 201 if S is int or S is bool: return T 202 if T is int: return S 203 # If one is a (strict) subclass of the other, coerce to the subclass. 
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T): return S
    if issubclass(T, S): return T
    # Ints coerce to the other type.
    if issubclass(T, int): return S
    if issubclass(S, int): return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))


def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _find_lteq(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError


def _find_rteq(a, l, x):
    'Locate the rightmost value exactly equal to x'
    i = bisect_right(a, x, lo=l)
    if i != (len(a) + 1) and a[i - 1] == x:
        return i - 1
    raise ValueError


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total / n, T)


def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.
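
    The data may be any iterable, including a one-shot iterator, which is
    consumed in a single pass:

    >>> fmean(iter([3, 4, 5]))
    4.0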

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        n = len(data)
    except TypeError:
        # Handle iterators that do not define __len__().
        n = 0
        def count(iterable):
            nonlocal n
            for n, x in enumerate(iterable, start=1):
                yield x
        total = fsum(count(data))
    else:
        total = fsum(data)
    try:
        return total / n
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None


def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None


def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    try:
        T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        return 0
    assert count == n
    return _convert(n / total, T)


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0
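
    The data need not be sorted in advance:

    >>> median([5, 1, 3])
    3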

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        i = n // 2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        return data[n // 2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n // 2]


def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n // 2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval / 2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval) / 2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1
    f = l2 - l1 + 1
    return L + interval * (n / 2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes with the same frequency, return the first one
    encountered:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.
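
    ``mode`` accepts any iterable, so a string works as a sequence of
    characters:

    >>> mode('aabbbbc')
    'b'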
555 556 """ 557 pairs = Counter(iter(data)).most_common(1) 558 try: 559 return pairs[0][0] 560 except IndexError: 561 raise StatisticsError('no mode for empty data') from None 562 563 564def multimode(data): 565 """Return a list of the most frequently occurring values. 566 567 Will return more than one result if there are multiple modes 568 or an empty list if *data* is empty. 569 570 >>> multimode('aabbbbbbbbcc') 571 ['b'] 572 >>> multimode('aabbbbccddddeeffffgg') 573 ['b', 'd', 'f'] 574 >>> multimode('') 575 [] 576 """ 577 counts = Counter(iter(data)).most_common() 578 maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, [])) 579 return list(map(itemgetter(0), mode_items)) 580 581 582# Notes on methods for computing quantiles 583# ---------------------------------------- 584# 585# There is no one perfect way to compute quantiles. Here we offer 586# two methods that serve common needs. Most other packages 587# surveyed offered at least one or both of these two, making them 588# "standard" in the sense of "widely-adopted and reproducible". 589# They are also easy to explain, easy to compute manually, and have 590# straight-forward interpretations that aren't surprising. 591 592# The default method is known as "R6", "PERCENTILE.EXC", or "expected 593# value of rank order statistics". The alternative method is known as 594# "R7", "PERCENTILE.INC", or "mode of rank order statistics". 595 596# For sample data where there is a positive probability for values 597# beyond the range of the data, the R6 exclusive method is a 598# reasonable choice. Consider a random sample of nine values from a 599# population with a uniform distribution from 0.0 to 1.0. The 600# distribution of the third ranked sample point is described by 601# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and 602# mean=0.300. Only the latter (which corresponds with R6) gives the 603# desired cut point with 30% of the population falling below that 604# value, making it comparable to a result from an inv_cdf() function. 605# The R6 exclusive method is also idempotent. 606 607# For describing population data where the end points are known to 608# be included in the data, the R7 inclusive method is a reasonable 609# choice. Instead of the mean, it uses the mode of the beta 610# distribution for the interior points. Per Hyndman & Fan, "One nice 611# property is that the vertices of Q7(p) divide the range into n - 1 612# intervals, and exactly 100p% of the intervals lie to the left of 613# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)." 614 615# If needed, other methods could be added. However, for now, the 616# position is that fewer options make for easier choices and that 617# external packages can be used for anything more advanced. 618 619def quantiles(data, *, n=4, method='exclusive'): 620 """Divide *data* into *n* continuous intervals with equal probability. 621 622 Returns a list of (n - 1) cut points separating the intervals. 623 624 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. 625 Set *n* to 100 for percentiles which gives the 99 cuts points that 626 separate *data* in to 100 equal sized groups. 627 628 The *data* can be any iterable containing sample. 629 The cut points are linearly interpolated between data points. 630 631 If *method* is set to *inclusive*, *data* is treated as population 632 data. The minimum value is treated as the 0th percentile and the 633 maximum value is treated as the 100th percentile. 
634 """ 635 if n < 1: 636 raise StatisticsError('n must be at least 1') 637 data = sorted(data) 638 ld = len(data) 639 if ld < 2: 640 raise StatisticsError('must have at least two data points') 641 if method == 'inclusive': 642 m = ld - 1 643 result = [] 644 for i in range(1, n): 645 j, delta = divmod(i * m, n) 646 interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n 647 result.append(interpolated) 648 return result 649 if method == 'exclusive': 650 m = ld + 1 651 result = [] 652 for i in range(1, n): 653 j = i * m // n # rescale i to m/n 654 j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1 655 delta = i*m - j*n # exact integer math 656 interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n 657 result.append(interpolated) 658 return result 659 raise ValueError(f'Unknown method: {method!r}') 660 661 662# === Measures of spread === 663 664# See http://mathworld.wolfram.com/Variance.html 665# http://mathworld.wolfram.com/SampleVariance.html 666# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 667# 668# Under no circumstances use the so-called "computational formula for 669# variance", as that is only suitable for hand calculations with a small 670# amount of low-precision data. It has terrible numeric properties. 671# 672# See a comparison of three computational methods here: 673# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ 674 675def _ss(data, c=None): 676 """Return sum of square deviations of sequence data. 677 678 If ``c`` is None, the mean is calculated in one pass, and the deviations 679 from the mean are calculated in a second pass. Otherwise, deviations are 680 calculated from ``c`` as given. Use the second case with care, as it can 681 lead to garbage results. 682 """ 683 if c is not None: 684 T, total, count = _sum((x-c)**2 for x in data) 685 return (T, total) 686 c = mean(data) 687 T, total, count = _sum((x-c)**2 for x in data) 688 # The following sum should mathematically equal zero, but due to rounding 689 # error may not. 690 U, total2, count2 = _sum((x - c) for x in data) 691 assert T == U and count == count2 692 total -= total2 ** 2 / len(data) 693 assert not total < 0, 'negative sum of square deviations: %f' % total 694 return (T, total) 695 696 697def variance(data, xbar=None): 698 """Return the sample variance of data. 699 700 data should be an iterable of Real-valued numbers, with at least two 701 values. The optional argument xbar, if given, should be the mean of 702 the data. If it is missing or None, the mean is automatically calculated. 703 704 Use this function when your data is a sample from a population. To 705 calculate the variance from the entire population, see ``pvariance``. 706 707 Examples: 708 709 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5] 710 >>> variance(data) 711 1.3720238095238095 712 713 If you have already calculated the mean of your data, you can pass it as 714 the optional second argument ``xbar`` to avoid recalculating it: 715 716 >>> m = mean(data) 717 >>> variance(data, m) 718 1.3720238095238095 719 720 This function does not check that ``xbar`` is actually the mean of 721 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or 722 impossible results. 

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    return _convert(ss / (n - 1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at
    least one value. The optional argument mu, if given, should be the mean
    of the data. If it is missing or None, the mean is automatically
    calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
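    # The approximation below has three regimes: a central rational
    # polynomial used when fabs(p - 0.5) <= 0.425, plus two tail fits
    # expressed in terms of r = sqrt(-log(min(p, 1.0 - p))).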
    q = p - 0.5
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        x = -x
    return mu + (x * sigma)


# If available, use C implementation
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass


class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        if not isinstance(data, (list, tuple)):
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
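        # A seed makes the draws reproducible: repeated calls with the same
        # seed return the same list of values.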
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
        variance = self._sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)

    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            raise StatisticsError('inv_cdf() not defined when sigma is at or below zero')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def quantiles(self, n=4):
        """Divide into *n* continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
        Set *n* to 100 for percentiles, which gives the 99 cut points that
        separate the normal distribution into 100 equal-sized groups.
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        >>> N1 = NormalDist(2.4, 1.6)
        >>> N2 = NormalDist(3.2, 2.0)
        >>> N1.overlap(N2)
        0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    def zscore(self, x):
        """Compute the Standard Score.  (x - mean) / stdev

        Describes *x* in terms of the number of standard deviations
        above or below the mean of the normal distribution.
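
        For example, a score of 130 on a scale with mean 100 and standard
        deviation 15 is two standard deviations above the mean:

        >>> NormalDist(100, 15).zscore(130)
        2.0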
1014 """ 1015 # https://www.statisticshowto.com/probability-and-statistics/z-score/ 1016 if not self._sigma: 1017 raise StatisticsError('zscore() not defined when sigma is zero') 1018 return (x - self._mu) / self._sigma 1019 1020 @property 1021 def mean(self): 1022 "Arithmetic mean of the normal distribution." 1023 return self._mu 1024 1025 @property 1026 def median(self): 1027 "Return the median of the normal distribution" 1028 return self._mu 1029 1030 @property 1031 def mode(self): 1032 """Return the mode of the normal distribution 1033 1034 The mode is the value x where which the probability density 1035 function (pdf) takes its maximum value. 1036 """ 1037 return self._mu 1038 1039 @property 1040 def stdev(self): 1041 "Standard deviation of the normal distribution." 1042 return self._sigma 1043 1044 @property 1045 def variance(self): 1046 "Square of the standard deviation." 1047 return self._sigma ** 2.0 1048 1049 def __add__(x1, x2): 1050 """Add a constant or another NormalDist instance. 1051 1052 If *other* is a constant, translate mu by the constant, 1053 leaving sigma unchanged. 1054 1055 If *other* is a NormalDist, add both the means and the variances. 1056 Mathematically, this works only if the two distributions are 1057 independent or if they are jointly normally distributed. 1058 """ 1059 if isinstance(x2, NormalDist): 1060 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma)) 1061 return NormalDist(x1._mu + x2, x1._sigma) 1062 1063 def __sub__(x1, x2): 1064 """Subtract a constant or another NormalDist instance. 1065 1066 If *other* is a constant, translate by the constant mu, 1067 leaving sigma unchanged. 1068 1069 If *other* is a NormalDist, subtract the means and add the variances. 1070 Mathematically, this works only if the two distributions are 1071 independent or if they are jointly normally distributed. 1072 """ 1073 if isinstance(x2, NormalDist): 1074 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma)) 1075 return NormalDist(x1._mu - x2, x1._sigma) 1076 1077 def __mul__(x1, x2): 1078 """Multiply both mu and sigma by a constant. 1079 1080 Used for rescaling, perhaps to change measurement units. 1081 Sigma is scaled with the absolute value of the constant. 1082 """ 1083 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2)) 1084 1085 def __truediv__(x1, x2): 1086 """Divide both mu and sigma by a constant. 1087 1088 Used for rescaling, perhaps to change measurement units. 1089 Sigma is scaled with the absolute value of the constant. 1090 """ 1091 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2)) 1092 1093 def __pos__(x1): 1094 "Return a copy of the instance." 1095 return NormalDist(x1._mu, x1._sigma) 1096 1097 def __neg__(x1): 1098 "Negates mu while keeping sigma the same." 1099 return NormalDist(-x1._mu, x1._sigma) 1100 1101 __radd__ = __add__ 1102 1103 def __rsub__(x1, x2): 1104 "Subtract a NormalDist from a constant or another NormalDist." 1105 return -(x1 - x2) 1106 1107 __rmul__ = __mul__ 1108 1109 def __eq__(x1, x2): 1110 "Two NormalDist objects are equal if their mu and sigma are both equal." 1111 if not isinstance(x2, NormalDist): 1112 return NotImplemented 1113 return x1._mu == x2._mu and x1._sigma == x2._sigma 1114 1115 def __hash__(self): 1116 "NormalDist objects hash equal if their mu and sigma are both equal." 1117 return hash((self._mu, self._sigma)) 1118 1119 def __repr__(self): 1120 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})' 1121