1""" 2Basic statistics module. 3 4This module provides functions for calculating statistics of data, including 5averages, variance, and standard deviation. 6 7Calculating averages 8-------------------- 9 10================== ================================================== 11Function Description 12================== ================================================== 13mean Arithmetic mean (average) of data. 14fmean Fast, floating-point arithmetic mean. 15geometric_mean Geometric mean of data. 16harmonic_mean Harmonic mean of data. 17median Median (middle value) of data. 18median_low Low median of data. 19median_high High median of data. 20median_grouped Median, or 50th percentile, of grouped data. 21mode Mode (most common value) of data. 22multimode List of modes (most common values of data). 23quantiles Divide data into intervals with equal probability. 24================== ================================================== 25 26Calculate the arithmetic mean ("the average") of data: 27 28>>> mean([-1.0, 2.5, 3.25, 5.75]) 292.625 30 31 32Calculate the standard median of discrete data: 33 34>>> median([2, 3, 4, 5]) 353.5 36 37 38Calculate the median, or 50th percentile, of data grouped into class intervals 39centred on the data values provided. E.g. if your data points are rounded to 40the nearest whole number: 41 42>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS 432.8333333333... 44 45This should be interpreted in this way: you have two data points in the class 46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in 47the class interval 3.5-4.5. The median of these data points is 2.8333... 48 49 50Calculating variability or spread 51--------------------------------- 52 53================== ============================================= 54Function Description 55================== ============================================= 56pvariance Population variance of data. 57variance Sample variance of data. 58pstdev Population standard deviation of data. 59stdev Sample standard deviation of data. 60================== ============================================= 61 62Calculate the standard deviation of sample data: 63 64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS 654.38961843444... 66 67If you have previously calculated the mean, you can pass it as the optional 68second argument to the four "spread" functions to avoid recalculating it: 69 70>>> data = [1, 2, 2, 4, 4, 4, 5, 6] 71>>> mu = mean(data) 72>>> pvariance(data, mu) 732.5 74 75 76Statistics for relations between two inputs 77------------------------------------------- 78 79================== ==================================================== 80Function Description 81================== ==================================================== 82covariance Sample covariance for two variables. 83correlation Pearson's correlation coefficient for two variables. 84linear_regression Intercept and slope for simple linear regression. 85================== ==================================================== 86 87Calculate covariance, Pearson's correlation, and simple linear regression 88for two inputs: 89 90>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] 91>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] 92>>> covariance(x, y) 930.75 94>>> correlation(x, y) #doctest: +ELLIPSIS 950.31622776601... 96>>> linear_regression(x, y) #doctest: 97LinearRegression(slope=0.1, intercept=1.5) 98 99 100Exceptions 101---------- 102 103A single exception is defined: StatisticsError is a subclass of ValueError. 
104 105""" 106 107__all__ = [ 108 'NormalDist', 109 'StatisticsError', 110 'correlation', 111 'covariance', 112 'fmean', 113 'geometric_mean', 114 'harmonic_mean', 115 'kde', 116 'kde_random', 117 'linear_regression', 118 'mean', 119 'median', 120 'median_grouped', 121 'median_high', 122 'median_low', 123 'mode', 124 'multimode', 125 'pstdev', 126 'pvariance', 127 'quantiles', 128 'stdev', 129 'variance', 130] 131 132import math 133import numbers 134import random 135import sys 136 137from fractions import Fraction 138from decimal import Decimal 139from itertools import count, groupby, repeat 140from bisect import bisect_left, bisect_right 141from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod 142from math import isfinite, isinf, pi, cos, sin, tan, cosh, asin, atan, acos 143from functools import reduce 144from operator import itemgetter 145from collections import Counter, namedtuple, defaultdict 146 147_SQRT2 = sqrt(2.0) 148_random = random 149 150# === Exceptions === 151 152class StatisticsError(ValueError): 153 pass 154 155 156# === Private utilities === 157 158def _sum(data): 159 """_sum(data) -> (type, sum, count) 160 161 Return a high-precision sum of the given numeric data as a fraction, 162 together with the type to be converted to and the count of items. 163 164 Examples 165 -------- 166 167 >>> _sum([3, 2.25, 4.5, -0.5, 0.25]) 168 (<class 'float'>, Fraction(19, 2), 5) 169 170 Some sources of round-off error will be avoided: 171 172 # Built-in sum returns zero. 173 >>> _sum([1e50, 1, -1e50] * 1000) 174 (<class 'float'>, Fraction(1000, 1), 3000) 175 176 Fractions and Decimals are also supported: 177 178 >>> from fractions import Fraction as F 179 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) 180 (<class 'fractions.Fraction'>, Fraction(63, 20), 4) 181 182 >>> from decimal import Decimal as D 183 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] 184 >>> _sum(data) 185 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4) 186 187 Mixed types are currently treated as an error, except that int is 188 allowed. 189 """ 190 count = 0 191 types = set() 192 types_add = types.add 193 partials = {} 194 partials_get = partials.get 195 for typ, values in groupby(data, type): 196 types_add(typ) 197 for n, d in map(_exact_ratio, values): 198 count += 1 199 partials[d] = partials_get(d, 0) + n 200 if None in partials: 201 # The sum will be a NAN or INF. We can ignore all the finite 202 # partials, and just look at this special one. 203 total = partials[None] 204 assert not _isfinite(total) 205 else: 206 # Sum all the partial sums using builtin sum. 207 total = sum(Fraction(n, d) for d, n in partials.items()) 208 T = reduce(_coerce, types, int) # or raise TypeError 209 return (T, total, count) 210 211 212def _ss(data, c=None): 213 """Return the exact mean and sum of square deviations of sequence data. 214 215 Calculations are done in a single pass, allowing the input to be an iterator. 216 217 If given *c* is used the mean; otherwise, it is calculated from the data. 218 Use the *c* argument with care, as it can lead to garbage results. 
219 220 """ 221 if c is not None: 222 T, ssd, count = _sum((d := x - c) * d for x in data) 223 return (T, ssd, c, count) 224 count = 0 225 types = set() 226 types_add = types.add 227 sx_partials = defaultdict(int) 228 sxx_partials = defaultdict(int) 229 for typ, values in groupby(data, type): 230 types_add(typ) 231 for n, d in map(_exact_ratio, values): 232 count += 1 233 sx_partials[d] += n 234 sxx_partials[d] += n * n 235 if not count: 236 ssd = c = Fraction(0) 237 elif None in sx_partials: 238 # The sum will be a NAN or INF. We can ignore all the finite 239 # partials, and just look at this special one. 240 ssd = c = sx_partials[None] 241 assert not _isfinite(ssd) 242 else: 243 sx = sum(Fraction(n, d) for d, n in sx_partials.items()) 244 sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items()) 245 # This formula has poor numeric properties for floats, 246 # but with fractions it is exact. 247 ssd = (count * sxx - sx * sx) / count 248 c = sx / count 249 T = reduce(_coerce, types, int) # or raise TypeError 250 return (T, ssd, c, count) 251 252 253def _isfinite(x): 254 try: 255 return x.is_finite() # Likely a Decimal. 256 except AttributeError: 257 return math.isfinite(x) # Coerces to float first. 258 259 260def _coerce(T, S): 261 """Coerce types T and S to a common type, or raise TypeError. 262 263 Coercion rules are currently an implementation detail. See the CoerceTest 264 test class in test_statistics for details. 265 """ 266 # See http://bugs.python.org/issue24068. 267 assert T is not bool, "initial type T is bool" 268 # If the types are the same, no need to coerce anything. Put this 269 # first, so that the usual case (no coercion needed) happens as soon 270 # as possible. 271 if T is S: return T 272 # Mixed int & other coerce to the other type. 273 if S is int or S is bool: return T 274 if T is int: return S 275 # If one is a (strict) subclass of the other, coerce to the subclass. 276 if issubclass(S, T): return S 277 if issubclass(T, S): return T 278 # Ints coerce to the other type. 279 if issubclass(T, int): return S 280 if issubclass(S, int): return T 281 # Mixed fraction & float coerces to float (or float subclass). 282 if issubclass(T, Fraction) and issubclass(S, float): 283 return S 284 if issubclass(T, float) and issubclass(S, Fraction): 285 return T 286 # Any other combination is disallowed. 287 msg = "don't know how to coerce %s and %s" 288 raise TypeError(msg % (T.__name__, S.__name__)) 289 290 291def _exact_ratio(x): 292 """Return Real number x to exact (numerator, denominator) pair. 293 294 >>> _exact_ratio(0.25) 295 (1, 4) 296 297 x is expected to be an int, Fraction, Decimal or float. 298 """ 299 300 # XXX We should revisit whether using fractions to accumulate exact 301 # ratios is the right way to go. 302 303 # The integer ratios for binary floats can have numerators or 304 # denominators with over 300 decimal digits. The problem is more 305 # acute with decimal floats where the default decimal context 306 # supports a huge range of exponents from Emin=-999999 to 307 # Emax=999999. When expanded with as_integer_ratio(), numbers like 308 # Decimal('3.14E+5000') and Decimal('3.14E-5000') have large 309 # numerators or denominators that will slow computation. 310 311 # When the integer ratios are accumulated as fractions, the size 312 # grows to cover the full range from the smallest magnitude to the 313 # largest. For example, Fraction(3.14E+300) + Fraction(3.14E-300), 314 # has a 616 digit numerator. 


def _exact_ratio(x):
    """Convert Real number x to an exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """

    # XXX We should revisit whether using fractions to accumulate exact
    # ratios is the right way to go.

    # The integer ratios for binary floats can have numerators or
    # denominators with over 300 decimal digits.  The problem is more
    # acute with decimal floats where the default decimal context
    # supports a huge range of exponents from Emin=-999999 to
    # Emax=999999.  When expanded with as_integer_ratio(), numbers like
    # Decimal('3.14E+5000') and Decimal('3.14E-5000') have large
    # numerators or denominators that will slow computation.

    # When the integer ratios are accumulated as fractions, the size
    # grows to cover the full range from the smallest magnitude to the
    # largest.  For example, Fraction(3.14E+300) + Fraction(3.14E-300)
    # has a 616 digit numerator.  Likewise,
    # Fraction(Decimal('3.14E+5000')) + Fraction(Decimal('3.14E-5000'))
    # has a 10,003 digit numerator.

    # This doesn't seem to have been a problem in practice, but it is a
    # potential pitfall.

    try:
        return x.as_integer_ratio()
    except AttributeError:
        pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    try:
        # x may be an Integral ABC.
        return (x.numerator, x.denominator)
    except AttributeError:
        msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
        raise TypeError(msg)


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x
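

# Illustration (not part of the module API): the numerator growth that
# the comments in _exact_ratio() warn about.  Adding exact fractions of
# widely different magnitudes is lossless but produces huge integers.
def _exact_ratio_growth_demo():
    big = Fraction(3.14e+300)
    small = Fraction(3.14e-300)
    total = big + small                  # exact, but expensive
    return len(str(total.numerator))     # hundreds of digits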


def _rank(data, /, *, key=None, reverse=False, ties='average', start=1) -> list[float]:
    """Rank order a dataset. The lowest value has rank 1.

    Ties are averaged so that equal values receive the same rank:

    >>> data = [31, 56, 31, 25, 75, 18]
    >>> _rank(data)
    [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]

    The operation is idempotent:

    >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0])
    [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]

    It is possible to rank the data in reverse order so that the
    highest value has rank 1.  Also, a key-function can extract
    the field to be ranked:

    >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)]
    >>> _rank(goals, key=itemgetter(1), reverse=True)
    [2.0, 1.0, 3.0]

    Ranks are conventionally numbered starting from one; however,
    setting *start* to zero allows the ranks to be used as array indices:

    >>> prize = ['Gold', 'Silver', 'Bronze', 'Certificate']
    >>> scores = [8.1, 7.3, 9.4, 8.3]
    >>> [prize[int(i)] for i in _rank(scores, start=0, reverse=True)]
    ['Bronze', 'Certificate', 'Gold', 'Silver']

    """
    # If this function becomes public at some point, more thought
    # needs to be given to the signature.  A list of ints is
    # plausible when ties is "min" or "max".  When ties is "average",
    # either list[float] or list[Fraction] is plausible.

    # Default handling of ties matches scipy.stats.mstats.spearmanr.
    if ties != 'average':
        raise ValueError(f'Unknown tie resolution method: {ties!r}')
    if key is not None:
        data = map(key, data)
    val_pos = sorted(zip(data, count()), reverse=reverse)
    i = start - 1
    result = [0] * len(val_pos)
    for _, g in groupby(val_pos, key=itemgetter(0)):
        group = list(g)
        size = len(group)
        rank = i + (size + 1) / 2
        for value, orig_pos in group:
            result[orig_pos] = rank
        i += size
    return result


def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
    """Square root of n/m, rounded to the nearest integer using round-to-odd."""
    # Reference: https://www.lri.fr/~melquion/doc/05-imacs17_1-expose.pdf
    a = math.isqrt(n // m)
    return a | (a*a*m != n)


# For 53 bit precision floats, the bit width used in
# _float_sqrt_of_frac() is 109.
_sqrt_bit_width: int = 2 * sys.float_info.mant_dig + 3


def _float_sqrt_of_frac(n: int, m: int) -> float:
    """Square root of n/m as a float, correctly rounded."""
    # See principle and proof sketch at: https://bugs.python.org/msg407078
    q = (n.bit_length() - m.bit_length() - _sqrt_bit_width) // 2
    if q >= 0:
        numerator = _integer_sqrt_of_frac_rto(n, m << 2 * q) << q
        denominator = 1
    else:
        numerator = _integer_sqrt_of_frac_rto(n << -2 * q, m)
        denominator = 1 << -q
    return numerator / denominator  # Convert to float


def _decimal_sqrt_of_frac(n: int, m: int) -> Decimal:
    """Square root of n/m as a Decimal, correctly rounded."""
    # Premise:  For decimal, computing (n/m).sqrt() can be off
    #           by 1 ulp from the correctly rounded result.
    # Method:   Check the result, moving up or down a step if needed.
    if n <= 0:
        if not n:
            return Decimal('0.0')
        n, m = -n, -m

    root = (Decimal(n) / Decimal(m)).sqrt()
    nr, dr = root.as_integer_ratio()

    plus = root.next_plus()
    np, dp = plus.as_integer_ratio()
    # test: n / m > ((root + plus) / 2) ** 2
    if 4 * n * (dr*dp)**2 > m * (dr*np + dp*nr)**2:
        return plus

    minus = root.next_minus()
    nm, dm = minus.as_integer_ratio()
    # test: n / m < ((root + minus) / 2) ** 2
    if 4 * n * (dr*dm)**2 < m * (dr*nm + dm*nr)**2:
        return minus

    return root
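

# Illustration (not part of the module API): the exact square-root
# helpers above avoid the double rounding in sqrt(n / m), where n / m
# rounds once and sqrt() rounds again.  _float_sqrt_of_frac() rounds
# the true value of the square root just once.  The inputs here are
# arbitrary, chosen only for this sketch.
def _sqrt_of_frac_demo():
    n, m = (1 << 1000) + 1, 3
    doubly_rounded = sqrt(n / m)                   # may be off by an ulp
    correctly_rounded = _float_sqrt_of_frac(n, m)  # rounded exactly once
    return doubly_rounded, correctly_rounded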


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    T, total, n = _sum(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    return _convert(total / n, T)


def fmean(data, weights=None):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    if weights is None:
        try:
            n = len(data)
        except TypeError:
            # Handle iterators that do not define __len__().
            n = 0
            def count(iterable):
                nonlocal n
                for n, x in enumerate(iterable, start=1):
                    yield x
            data = count(data)
        total = fsum(data)
        if not n:
            raise StatisticsError('fmean requires at least one data point')
        return total / n
    if not isinstance(weights, (list, tuple)):
        weights = list(weights)
    try:
        num = sumprod(data, weights)
    except ValueError:
        raise StatisticsError('data and weights must be the same length')
    den = fsum(weights)
    if not den:
        raise StatisticsError('sum of weights must be non-zero')
    return num / den


def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty
    or if it contains a negative value.

    Returns zero if the product of inputs is zero.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    n = 0
    found_zero = False
    def count_positive(iterable):
        nonlocal n, found_zero
        for n, x in enumerate(iterable, start=1):
            if x > 0.0 or math.isnan(x):
                yield x
            elif x == 0.0:
                found_zero = True
            else:
                raise StatisticsError('No negative inputs allowed', x)
    total = fsum(map(log, count_positive(data)))
    if not n:
        raise StatisticsError('Must have a non-empty dataset')
    if math.isnan(total):
        return math.nan
    if found_zero:
        return math.nan if total == math.inf else 0.0
    return exp(total / n)


def harmonic_mean(data, weights=None):
    """Return the harmonic mean of data.

    The harmonic mean is the reciprocal of the arithmetic mean of the
    reciprocals of the data.  It can be used for averaging ratios or
    rates, for example speeds.

    Suppose a car travels 40 km/hr for 5 km and then speeds up to
    60 km/hr for another 5 km. What is the average speed?

    >>> harmonic_mean([40, 60])
    48.0

    Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
    speeds up to 60 km/hr for the remaining 30 km of the journey. What
    is the average speed?

    >>> harmonic_mean([40, 60], weights=[5, 30])
    56.0

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1 and weights is None:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    if weights is None:
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
    try:
        data = _fail_neg(data, errmsg)
        T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
    except ZeroDivisionError:
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)
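

# Illustration (not part of the module API): the four averages above on
# the same dataset.  For positive data, the harmonic mean never exceeds
# the geometric mean, which never exceeds the arithmetic mean.
def _averages_demo():
    data = [40, 60]
    return (mean(data),             # 50
            fmean(data),            # 50.0
            geometric_mean(data),   # 48.98979...
            harmonic_mean(data))    # 48.0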


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        i = n // 2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        return data[n // 2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n // 2]


def median_grouped(data, interval=1.0):
    """Estimates the median for numeric data binned around the midpoints
    of consecutive, fixed-width intervals.

    The *data* can be any iterable of numeric data with each value being
    exactly the midpoint of a bin.  At least one value must be present.

    The *interval* is the width of each bin.

    For example, demographic information may have been summarized into
    consecutive ten-year age groups with each group being represented
    by the 5-year midpoints of the intervals:

    >>> demographics = Counter({
    ...    25: 172,   # 20 to 30 years old
    ...    35: 484,   # 30 to 40 years old
    ...    45: 387,   # 40 to 50 years old
    ...    55:  22,   # 50 to 60 years old
    ...    65:   6,   # 60 to 70 years old
    ... })

    The 50th percentile (median) is the 536th person out of the 1071
    member cohort.  That person is in the 30 to 40 year old age group.

    The regular median() function would assume that everyone in the
    tricenarian age group was exactly 35 years old.  A more tenable
    assumption is that the 484 members of that age group are evenly
    distributed between 30 and 40.  For that, we use median_grouped().

    >>> data = list(demographics.elements())
    >>> median(data)
    35
    >>> round(median_grouped(data, interval=10), 1)
    37.5

    The caller is responsible for making sure the data points are separated
    by exact multiples of *interval*.  This is essential for getting a
    correct result.  The function does not check this precondition.

    Inputs may be any numeric type that can be coerced to a float during
    the interpolation step.

    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")

    # Find the value at the midpoint.  Remember this corresponds to the
    # midpoint of the class interval.
    x = data[n // 2]

    # Using O(log n) bisection, find where all the x values occur in the data.
    # All x will lie within data[i:j].
    i = bisect_left(data, x)
    j = bisect_right(data, x, lo=i)

    # Coerce to floats, raising a TypeError if not possible
    try:
        interval = float(interval)
        x = float(x)
    except ValueError:
        raise TypeError('Value cannot be converted to a float')

    # Interpolate the median using the formula found at:
    # https://www.cuemath.com/data/median-of-grouped-data/
    L = x - interval / 2.0    # Lower limit of the median interval
    cf = i                    # Cumulative frequency of the preceding interval
    f = j - i                 # Number of elements in the median interval
    return L + interval * (n / 2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes with the same frequency, the first one
    encountered is returned:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data))
    if not counts:
        return []
    maxcount = max(counts.values())
    return [value for value, count in counts.items() if count == maxcount]


def kde(data, h, kernel='normal', *, cumulative=False):
    """Kernel Density Estimation:  Create a continuous probability density
    function or cumulative distribution function from discrete samples.

    The basic idea is to smooth the data using a kernel function
    to help draw inferences about a population from a sample.

    The degree of smoothing is controlled by the scaling parameter h
    which is called the bandwidth.  Smaller values emphasize local
    features while larger values give smoother results.

    The kernel determines the relative weights of the sample data
    points.  Generally, the choice of kernel shape does not matter
    as much as the more influential bandwidth smoothing parameter.

    Kernels that give some weight to every sample point:

       normal (gauss)
       logistic
       sigmoid

    Kernels that only give weight to sample points within
    the bandwidth:

       rectangular (uniform)
       triangular
       parabolic (epanechnikov)
       quartic (biweight)
       triweight
       cosine

    If *cumulative* is true, will return a cumulative distribution function.

    A StatisticsError will be raised if the data sequence is empty.

    Example
    -------

    Given a sample of six data points, construct a continuous
    function that estimates the underlying probability density:

    >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
    >>> f_hat = kde(sample, h=1.5)

    Compute the area under the curve:

    >>> area = sum(f_hat(x) for x in range(-20, 20))
    >>> round(area, 4)
    1.0

    Plot the estimated probability density function at
    evenly spaced points from -6 to 10:

    >>> for x in range(-6, 11):
    ...     density = f_hat(x)
    ...     plot = ' ' * int(density * 400) + 'x'
    ...     print(f'{x:2}: {density:.3f} {plot}')
    ...
    -6: 0.002 x
    -5: 0.009    x
    -4: 0.031             x
    -3: 0.070                             x
    -2: 0.111                                             x
    -1: 0.125                                                   x
     0: 0.110                                            x
     1: 0.086                                   x
     2: 0.068                            x
     3: 0.059                        x
     4: 0.066                           x
     5: 0.082                                 x
     6: 0.082                                 x
     7: 0.058                        x
     8: 0.028            x
     9: 0.009    x
    10: 0.002 x

    Estimate P(4.5 < X <= 7.5), the probability that a new sample value
    will be between 4.5 and 7.5:

    >>> cdf = kde(sample, h=1.5, cumulative=True)
    >>> round(cdf(7.5) - cdf(4.5), 2)
    0.22

    References
    ----------

    Kernel density estimation and its application:
    https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf

    Kernel functions in common use:
    https://en.wikipedia.org/wiki/Kernel_(statistics)#kernel_functions_in_common_use

    Interactive graphical demonstration and exploration:
    https://demonstrations.wolfram.com/KernelDensityEstimation/

    Kernel estimation of cumulative distribution function of a random variable with bounded support:
    https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf

    """

    n = len(data)
    if not n:
        raise StatisticsError('Empty data sequence')

    if not isinstance(data[0], (int, float)):
        raise TypeError('Data sequence must contain ints or floats')

    if h <= 0.0:
        raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')

    match kernel:

        case 'normal' | 'gauss':
            sqrt2pi = sqrt(2 * pi)
            sqrt2 = sqrt(2)
            K = lambda t: exp(-1/2 * t * t) / sqrt2pi
            W = lambda t: 1/2 * (1.0 + erf(t / sqrt2))
            support = None

        case 'logistic':
            # 1.0 / (exp(t) + 2.0 + exp(-t))
            K = lambda t: 1/2 / (1.0 + cosh(t))
            W = lambda t: 1.0 - 1.0 / (exp(t) + 1.0)
            support = None

        case 'sigmoid':
            # (2/pi) / (exp(t) + exp(-t))
            c1 = 1 / pi
            c2 = 2 / pi
            K = lambda t: c1 / cosh(t)
            W = lambda t: c2 * atan(exp(t))
            support = None

        case 'rectangular' | 'uniform':
            K = lambda t: 1/2
            W = lambda t: 1/2 * t + 1/2
            support = 1.0

        case 'triangular':
            K = lambda t: 1.0 - abs(t)
            W = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2
            support = 1.0

        case 'parabolic' | 'epanechnikov':
            K = lambda t: 3/4 * (1.0 - t * t)
            W = lambda t: -1/4 * t**3 + 3/4 * t + 1/2
            support = 1.0

        case 'quartic' | 'biweight':
            K = lambda t: 15/16 * (1.0 - t * t) ** 2
            W = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2
            support = 1.0

        case 'triweight':
            K = lambda t: 35/32 * (1.0 - t * t) ** 3
            W = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2
            support = 1.0

        case 'cosine':
            c1 = pi / 4
            c2 = pi / 2
            K = lambda t: c1 * cos(c2 * t)
            W = lambda t: 1/2 * sin(c2 * t) + 1/2
            support = 1.0

        case _:
            raise StatisticsError(f'Unknown kernel name: {kernel!r}')

    if support is None:

        def pdf(x):
            n = len(data)
            return sum(K((x - x_i) / h) for x_i in data) / (n * h)

        def cdf(x):
            n = len(data)
            return sum(W((x - x_i) / h) for x_i in data) / n

    else:

        sample = sorted(data)
        bandwidth = h * support

        def pdf(x):
            nonlocal n, sample
            if len(data) != n:
                sample = sorted(data)
                n = len(data)
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum(K((x - x_i) / h) for x_i in supported) / (n * h)

        def cdf(x):
            nonlocal n, sample
            if len(data) != n:
                sample = sorted(data)
                n = len(data)
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum((W((x - x_i) / h) for x_i in supported), i) / n

    if cumulative:
        cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'
        return cdf

    else:
        pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
        return pdf


# Notes on methods for computing quantiles
# ----------------------------------------
#
# There is no one perfect way to compute quantiles.  Here we offer
# two methods that serve common needs.  Most other packages
# surveyed offered at least one or both of these two, making them
# "standard" in the sense of "widely-adopted and reproducible".
# They are also easy to explain, easy to compute manually, and have
# straight-forward interpretations that aren't surprising.

# The default method is known as "R6", "PERCENTILE.EXC", or "expected
# value of rank order statistics". The alternative method is known as
# "R7", "PERCENTILE.INC", or "mode of rank order statistics".

# For sample data where there is a positive probability for values
# beyond the range of the data, the R6 exclusive method is a
# reasonable choice.  Consider a random sample of nine values from a
# population with a uniform distribution from 0.0 to 1.0.  The
# distribution of the third ranked sample point is described by
# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
# mean=0.300.  Only the latter (which corresponds with R6) gives the
# desired cut point with 30% of the population falling below that
# value, making it comparable to a result from an inv_cdf() function.
# The R6 exclusive method is also idempotent.

# For describing population data where the end points are known to
# be included in the data, the R7 inclusive method is a reasonable
# choice.  Instead of the mean, it uses the mode of the beta
# distribution for the interior points.  Per Hyndman & Fan, "One nice
# property is that the vertices of Q7(p) divide the range into n - 1
# intervals, and exactly 100p% of the intervals lie to the left of
# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."

# If needed, other methods could be added.  However, for now, the
# position is that fewer options make for easier choices and that
# external packages can be used for anything more advanced.

def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cut points that
    separate *data* into 100 equal-sized groups.

    The *data* can be any iterable containing sample data.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        if ld == 1:
            return data * (n - 1)
        raise StatisticsError('must have at least one data point')

    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            j, delta = divmod(i * m, n)
            interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
            result.append(interpolated)
        return result

    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result

    raise ValueError(f'Unknown method: {method!r}')
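

# Illustration (not part of the module API): the exclusive (R6) and
# inclusive (R7) methods described in the notes above, side by side on
# the same small dataset.
def _quantiles_methods_demo():
    data = [1, 2, 3, 4]
    exclusive = quantiles(data, n=4)                      # [1.25, 2.5, 3.75]
    inclusive = quantiles(data, n=4, method='inclusive')  # [1.75, 2.5, 3.25]
    # The exclusive method allows cut points beyond the observed min and
    # max; the inclusive method treats them as the 0th and 100th percentiles.
    return exclusive, inclusive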


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    return _convert(ss / (n - 1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pstdev requires at least one data point')
    mss = ss / n
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def _mean_stdev(data):
    """In one pass, compute the mean and sample standard deviation as floats."""
    T, ss, xbar, n = _ss(data)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    try:
        return float(xbar), _float_sqrt_of_frac(mss.numerator, mss.denominator)
    except AttributeError:
        # Handle Nans and Infs gracefully
        return float(xbar), float(xbar) / float(ss)

def _sqrtprod(x: float, y: float) -> float:
    "Return sqrt(x * y) computed with improved accuracy and without overflow/underflow."
    h = sqrt(x * y)
    if not isfinite(h):
        if isinf(h) and not isinf(x) and not isinf(y):
            # Finite inputs overflowed, so scale down, and recompute.
            scale = 2.0 ** -512  # sqrt(1 / sys.float_info.max)
            return _sqrtprod(scale * x, scale * y) / scale
        return h
    if not h:
        if x and y:
            # Non-zero inputs underflowed, so scale up, and recompute.
            # Scale:  1 / sqrt(sys.float_info.min * sys.float_info.epsilon)
            scale = 2.0 ** 537
            return _sqrtprod(scale * x, scale * y) / scale
        return h
    # Improve accuracy with a differential correction.
    # https://www.wolframalpha.com/input/?i=Maclaurin+series+sqrt%28h**2+%2B+x%29+at+x%3D0
    d = sumprod((x, h), (y, -h))
    return h + d / (2.0 * h)


# === Statistics for relations between two inputs ===

# See https://en.wikipedia.org/wiki/Covariance
#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
#     https://en.wikipedia.org/wiki/Simple_linear_regression


def covariance(x, y, /):
    """Covariance

    Return the sample covariance of two inputs *x* and *y*. Covariance
    is a measure of the joint variability of two inputs.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> covariance(x, z)
    -7.5
    >>> covariance(z, x)
    -7.5

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('covariance requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('covariance requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y))
    return sxy / (n - 1)


def correlation(x, y, /, *, method='linear'):
    """Pearson's correlation coefficient

    Return the Pearson's correlation coefficient for two inputs. Pearson's
    correlation coefficient *r* takes values between -1 and +1. It measures
    the strength and direction of a linear relationship.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> correlation(x, x)
    1.0
    >>> correlation(x, y)
    -1.0

    If *method* is "ranked", computes Spearman's rank correlation coefficient
    for two inputs.  The data is replaced by ranks.  Ties are averaged
    so that equal values receive the same rank.  The resulting coefficient
    measures the strength of a monotonic relationship.

    Spearman's rank correlation coefficient is appropriate for ordinal
    data or for continuous data that doesn't meet the linear proportion
    requirement for Pearson's correlation coefficient.
    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('correlation requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('correlation requires at least two data points')
    if method not in {'linear', 'ranked'}:
        raise ValueError(f'Unknown method: {method!r}')
    if method == 'ranked':
        start = (n - 1) / -2            # Center rankings around zero
        x = _rank(x, start=start)
        y = _rank(y, start=start)
    else:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        x = [xi - xbar for xi in x]
        y = [yi - ybar for yi in y]
    sxy = sumprod(x, y)
    sxx = sumprod(x, x)
    syy = sumprod(y, y)
    try:
        return sxy / _sqrtprod(sxx, syy)
    except ZeroDivisionError:
        raise StatisticsError('at least one of the inputs is constant')
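

# Illustration (not part of the module API): Pearson versus Spearman
# ("ranked") correlation on a monotonic but non-linear relationship.
def _correlation_methods_demo():
    x = [1, 2, 3, 4, 5]
    y = [xi ** 3 for xi in x]                       # monotonic, not linear
    pearson = correlation(x, y)                     # strong but < 1.0
    spearman = correlation(x, y, method='ranked')   # exactly 1.0
    return pearson, spearman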


LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))


def linear_regression(x, y, /, *, proportional=False):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept, the simple linear regression
    parameters estimated using ordinary least squares.  Simple linear
    regression describes the relationship between an independent variable
    *x* and a dependent variable *y* in terms of a linear function:

        y = slope * x + intercept + noise

    where *slope* and *intercept* are the regression parameters that are
    estimated, and noise represents the variability of the data that was
    not explained by the linear regression (it is equal to the
    difference between predicted and actual values of the dependent
    variable).

    The parameters are returned as a named tuple.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.17495..., intercept=1.00925...)

    If *proportional* is true, the independent variable *x* and the
    dependent variable *y* are assumed to be directly proportional.
    The data is fit to a line passing through the origin.

    Since the *intercept* will always be 0.0, the underlying linear
    function simplifies to:

        y = slope * x + noise

    >>> y = [3 * x[i] + noise[i] for i in range(5)]
    >>> linear_regression(x, y, proportional=True)  #doctest: +ELLIPSIS
    LinearRegression(slope=2.90475..., intercept=0.0)

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('linear regression requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('linear regression requires at least two data points')
    if not proportional:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        x = [xi - xbar for xi in x]  # List because used three times below
        y = (yi - ybar for yi in y)  # Generator because only used once below
    sxy = sumprod(x, y) + 0.0        # Add zero to coerce result to a float
    sxx = sumprod(x, x)
    try:
        slope = sxy / sxx   # equivalent to:  covariance(x, y) / variance(x)
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    intercept = 0.0 if proportional else ybar - slope * xbar
    return LinearRegression(slope=slope, intercept=intercept)
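

# Illustration (not part of the module API): using the fitted parameters
# to predict a value.  The dataset here is invented for this sketch.
def _linear_regression_demo():
    year = [2016, 2017, 2018, 2019, 2020]
    sales = [12.1, 12.9, 14.2, 14.8, 16.0]
    slope, intercept = linear_regression(year, sales)
    return slope * 2021 + intercept     # extrapolated estimate for 2021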


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5

    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)

    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        x = -x
    return mu + (x * sigma)


# If available, use C implementation
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass
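

# Illustration (not part of the module API): a quick sanity check of the
# rational approximation above.  The inverse CDF should round-trip
# through the CDF to within floating-point accuracy.
def _inv_cdf_roundtrip_demo(p=0.3):
    x = _normal_dist_inv_cdf(p, 0.0, 1.0)
    return abs(NormalDist().cdf(x) - p)     # very small, near zero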


class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        return cls(*_mean_stdev(data))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        rnd = random.random if seed is None else random.Random(seed).random
        inv_cdf = _normal_dist_inv_cdf
        mu = self._mu
        sigma = self._sigma
        return [inv_cdf(rnd(), mu, sigma) for _ in repeat(None, n)]

    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
        variance = self._sigma * self._sigma
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        diff = x - self._mu
        return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)

    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * _SQRT2)))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def quantiles(self, n=4):
        """Divide into *n* continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
        Set *n* to 100 for percentiles which gives the 99 cut points that
        separate the normal distribution into 100 equal-sized groups.
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        >>> N1 = NormalDist(2.4, 1.6)
        >>> N2 = NormalDist(3.2, 2.0)
        >>> N1.overlap(N2)
        0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X._sigma * _SQRT2))
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1602 """ 1603 # https://www.statisticshowto.com/probability-and-statistics/z-score/ 1604 if not self._sigma: 1605 raise StatisticsError('zscore() not defined when sigma is zero') 1606 return (x - self._mu) / self._sigma 1607 1608 @property 1609 def mean(self): 1610 "Arithmetic mean of the normal distribution." 1611 return self._mu 1612 1613 @property 1614 def median(self): 1615 "Return the median of the normal distribution" 1616 return self._mu 1617 1618 @property 1619 def mode(self): 1620 """Return the mode of the normal distribution 1621 1622 The mode is the value x where which the probability density 1623 function (pdf) takes its maximum value. 1624 """ 1625 return self._mu 1626 1627 @property 1628 def stdev(self): 1629 "Standard deviation of the normal distribution." 1630 return self._sigma 1631 1632 @property 1633 def variance(self): 1634 "Square of the standard deviation." 1635 return self._sigma * self._sigma 1636 1637 def __add__(x1, x2): 1638 """Add a constant or another NormalDist instance. 1639 1640 If *other* is a constant, translate mu by the constant, 1641 leaving sigma unchanged. 1642 1643 If *other* is a NormalDist, add both the means and the variances. 1644 Mathematically, this works only if the two distributions are 1645 independent or if they are jointly normally distributed. 1646 """ 1647 if isinstance(x2, NormalDist): 1648 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma)) 1649 return NormalDist(x1._mu + x2, x1._sigma) 1650 1651 def __sub__(x1, x2): 1652 """Subtract a constant or another NormalDist instance. 1653 1654 If *other* is a constant, translate by the constant mu, 1655 leaving sigma unchanged. 1656 1657 If *other* is a NormalDist, subtract the means and add the variances. 1658 Mathematically, this works only if the two distributions are 1659 independent or if they are jointly normally distributed. 1660 """ 1661 if isinstance(x2, NormalDist): 1662 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma)) 1663 return NormalDist(x1._mu - x2, x1._sigma) 1664 1665 def __mul__(x1, x2): 1666 """Multiply both mu and sigma by a constant. 1667 1668 Used for rescaling, perhaps to change measurement units. 1669 Sigma is scaled with the absolute value of the constant. 1670 """ 1671 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2)) 1672 1673 def __truediv__(x1, x2): 1674 """Divide both mu and sigma by a constant. 1675 1676 Used for rescaling, perhaps to change measurement units. 1677 Sigma is scaled with the absolute value of the constant. 1678 """ 1679 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2)) 1680 1681 def __pos__(x1): 1682 "Return a copy of the instance." 1683 return NormalDist(x1._mu, x1._sigma) 1684 1685 def __neg__(x1): 1686 "Negates mu while keeping sigma the same." 1687 return NormalDist(-x1._mu, x1._sigma) 1688 1689 __radd__ = __add__ 1690 1691 def __rsub__(x1, x2): 1692 "Subtract a NormalDist from a constant or another NormalDist." 1693 return -(x1 - x2) 1694 1695 __rmul__ = __mul__ 1696 1697 def __eq__(x1, x2): 1698 "Two NormalDist objects are equal if their mu and sigma are both equal." 1699 if not isinstance(x2, NormalDist): 1700 return NotImplemented 1701 return x1._mu == x2._mu and x1._sigma == x2._sigma 1702 1703 def __hash__(self): 1704 "NormalDist objects hash equal if their mu and sigma are both equal." 

    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'

    def __getstate__(self):
        return self._mu, self._sigma

    def __setstate__(self, state):
        self._mu, self._sigma = state


## kde_random() ##############################################################

def _newton_raphson(f_inv_estimate, f, f_prime, tolerance=1e-12):
    def f_inv(y):
        "Return x such that f(x) ≈ y within the specified tolerance."
        x = f_inv_estimate(y)
        while abs(diff := f(x) - y) > tolerance:
            x -= diff / f_prime(x)
        return x
    return f_inv

def _quartic_invcdf_estimate(p):
    sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p)
    x = (2.0 * p) ** 0.4258865685331 - 1.0
    if 0.004 <= p < 0.499:
        x += 0.026818732 * sin(7.101753784 * p + 2.73230839482953)
    return x * sign

_quartic_invcdf = _newton_raphson(
    f_inv_estimate = _quartic_invcdf_estimate,
    f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2,
    f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2)

def _triweight_invcdf_estimate(p):
    sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p)
    x = (2.0 * p) ** 0.3400218741872791 - 1.0
    return x * sign

_triweight_invcdf = _newton_raphson(
    f_inv_estimate = _triweight_invcdf_estimate,
    f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2,
    f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3)

_kernel_invcdfs = {
    'normal': NormalDist().inv_cdf,
    'logistic': lambda p: log(p / (1 - p)),
    'sigmoid': lambda p: log(tan(p * pi/2)),
    'rectangular': lambda p: 2*p - 1,
    'parabolic': lambda p: 2 * cos((acos(2*p-1) + pi) / 3),
    'quartic': _quartic_invcdf,
    'triweight': _triweight_invcdf,
    'triangular': lambda p: sqrt(2*p) - 1 if p < 1/2 else 1 - sqrt(2 - 2*p),
    'cosine': lambda p: 2 * asin(2*p - 1) / pi,
}
_kernel_invcdfs['gauss'] = _kernel_invcdfs['normal']
_kernel_invcdfs['uniform'] = _kernel_invcdfs['rectangular']
_kernel_invcdfs['epanechnikov'] = _kernel_invcdfs['parabolic']
_kernel_invcdfs['biweight'] = _kernel_invcdfs['quartic']
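

# Illustration (not part of the module API): _newton_raphson() at work
# on a simple case, inverting f(x) = x**3 from a crude starting guess.
# The _cbrt_demo name is ours, invented for this sketch.
_cbrt_demo = _newton_raphson(
    f_inv_estimate = lambda y: y,     # poor guess; Newton still converges
    f = lambda x: x ** 3,
    f_prime = lambda x: 3.0 * x * x)
# _cbrt_demo(27.0) converges to 3.0 within the default tolerance.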

def kde_random(data, h, kernel='normal', *, seed=None):
    """Return a function that makes a random selection from the estimated
    probability density function created by kde(data, h, kernel).

    Providing a *seed* allows reproducible selections within a single
    thread.  The seed may be an integer, float, str, or bytes.

    A StatisticsError will be raised if the *data* sequence is empty.

    Example:

    >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
    >>> rand = kde_random(data, h=1.5, seed=8675309)
    >>> new_selections = [rand() for i in range(10)]
    >>> [round(x, 1) for x in new_selections]
    [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6]

    """
    n = len(data)
    if not n:
        raise StatisticsError('Empty data sequence')

    if not isinstance(data[0], (int, float)):
        raise TypeError('Data sequence must contain ints or floats')

    if h <= 0.0:
        raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')

    kernel_invcdf = _kernel_invcdfs.get(kernel)
    if kernel_invcdf is None:
        raise StatisticsError(f'Unknown kernel name: {kernel!r}')

    prng = _random.Random(seed)
    random = prng.random
    choice = prng.choice

    def rand():
        return choice(data) + h * kernel_invcdf(random())

    rand.__doc__ = f'Random KDE selection with {h=!r} and {kernel=!r}'

    return rand
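

# Illustration (not part of the module API): NormalDist arithmetic from
# the operator methods defined above, combining independent normal
# distributions and rescaling to new units.  The sample numbers are
# invented for this sketch.
def _normal_dist_arithmetic_demo():
    weights_kg = NormalDist.from_samples([2.5, 3.1, 2.1, 2.4, 2.7, 3.5])
    measurement_error = NormalDist(mu=0.0, sigma=0.05)
    with_error = weights_kg + measurement_error   # means add, variances add
    weights_g = weights_kg * 1000                 # rescale: kg -> g
    return with_error, weights_g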