1"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10==================  ==================================================
11Function            Description
12==================  ==================================================
13mean                Arithmetic mean (average) of data.
14fmean               Fast, floating-point arithmetic mean.
15geometric_mean      Geometric mean of data.
16harmonic_mean       Harmonic mean of data.
17median              Median (middle value) of data.
18median_low          Low median of data.
19median_high         High median of data.
20median_grouped      Median, or 50th percentile, of grouped data.
21mode                Mode (most common value) of data.
22multimode           List of modes (most common values of data).
23quantiles           Divide data into intervals with equal probability.
24==================  ==================================================
25
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53==================  =============================================
54Function            Description
55==================  =============================================
56pvariance           Population variance of data.
57variance            Sample variance of data.
58pstdev              Population standard deviation of data.
59stdev               Sample standard deviation of data.
60==================  =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
76Statistics for relations between two inputs
77-------------------------------------------
78
79==================  ====================================================
80Function            Description
81==================  ====================================================
82covariance          Sample covariance for two variables.
83correlation         Pearson's correlation coefficient for two variables.
84linear_regression   Intercept and slope for simple linear regression.
85==================  ====================================================
86
87Calculate covariance, Pearson's correlation, and simple linear regression
88for two inputs:
89
90>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
91>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
92>>> covariance(x, y)
930.75
94>>> correlation(x, y)  #doctest: +ELLIPSIS
950.31622776601...
96>>> linear_regression(x, y)  #doctest:
97LinearRegression(slope=0.1, intercept=1.5)
98
99
100Exceptions
101----------
102
103A single exception is defined: StatisticsError is a subclass of ValueError.
104
105"""

__all__ = [
    'NormalDist',
    'StatisticsError',
    'correlation',
    'covariance',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'kde',
    'kde_random',
    'linear_regression',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]

import math
import numbers
import random
import sys

from fractions import Fraction
from decimal import Decimal
from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
from math import isfinite, isinf, pi, cos, sin, tan, cosh, asin, atan, acos
from functools import reduce
from operator import itemgetter
from collections import Counter, namedtuple, defaultdict

_SQRT2 = sqrt(2.0)
_random = random

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data):
    """_sum(data) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
    (<class 'float'>, Fraction(19, 2), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    types = set()
    types_add = types.add
    partials = {}
    partials_get = partials.get
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        total = sum(Fraction(n, d) for d, n in partials.items())
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, total, count)


def _ss(data, c=None):
    """Return the exact mean and sum of square deviations of sequence data.

    Calculations are done in a single pass, allowing the input to be an iterator.

    If given, *c* is used as the mean; otherwise, it is calculated from the
    data.  Use the *c* argument with care, as it can lead to garbage results.

    """
    if c is not None:
        T, ssd, count = _sum((d := x - c) * d for x in data)
        return (T, ssd, c, count)
    count = 0
    types = set()
    types_add = types.add
    sx_partials = defaultdict(int)
    sxx_partials = defaultdict(int)
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            sx_partials[d] += n
            sxx_partials[d] += n * n
    if not count:
        ssd = c = Fraction(0)
    elif None in sx_partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        ssd = c = sx_partials[None]
        assert not _isfinite(ssd)
    else:
        sx = sum(Fraction(n, d) for d, n in sx_partials.items())
        sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items())
        # This formula has poor numeric properties for floats,
        # but with fractions it is exact.
        ssd = (count * sxx - sx * sx) / count
        c = sx / count
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, ssd, c, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
    """
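    # A couple of illustrative cases (readable off the rules below, not from
    # the test suite): mixing int with any other type yields the other type,
    # so _coerce(int, float) is float; mixing float and Fraction yields
    # float, so _coerce(float, Fraction) is float.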
    # See http://bugs.python.org/issue24068.
    assert T is not bool, "initial type T is bool"
    # If the types are the same, no need to coerce anything. Put this
    # first, so that the usual case (no coercion needed) happens as soon
    # as possible.
    if T is S:  return T
    # Mixed int & other coerce to the other type.
    if S is int or S is bool:  return T
    if T is int:  return S
    # If one is a (strict) subclass of the other, coerce to the subclass.
    if issubclass(S, T):  return S
    if issubclass(T, S):  return T
    # Ints coerce to the other type.
    if issubclass(T, int):  return S
    if issubclass(S, int):  return T
    # Mixed fraction & float coerces to float (or float subclass).
    if issubclass(T, Fraction) and issubclass(S, float):
        return S
    if issubclass(T, float) and issubclass(S, Fraction):
        return T
    # Any other combination is disallowed.
    msg = "don't know how to coerce %s and %s"
    raise TypeError(msg % (T.__name__, S.__name__))


def _exact_ratio(x):
    """Convert Real number x to an exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """

    # XXX We should revisit whether using fractions to accumulate exact
    # ratios is the right way to go.

    # The integer ratios for binary floats can have numerators or
    # denominators with over 300 decimal digits.  The problem is more
    # acute with decimal floats where the default decimal context
    # supports a huge range of exponents from Emin=-999999 to
    # Emax=999999.  When expanded with as_integer_ratio(), numbers like
    # Decimal('3.14E+5000') and Decimal('3.14E-5000') have large
    # numerators or denominators that will slow computation.

    # When the integer ratios are accumulated as fractions, the size
    # grows to cover the full range from the smallest magnitude to the
    # largest.  For example, Fraction(3.14E+300) + Fraction(3.14E-300),
    # has a 616 digit numerator.  Likewise,
    # Fraction(Decimal('3.14E+5000')) + Fraction(Decimal('3.14E-5000'))
    # has a 10,003 digit numerator.

    # This doesn't seem to have been a problem in practice, but it is a
    # potential pitfall.

    try:
        return x.as_integer_ratio()
    except AttributeError:
        pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    try:
        # x may be an Integral ABC.
        return (x.numerator, x.denominator)
    except AttributeError:
        msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
        raise TypeError(msg)


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


def _rank(data, /, *, key=None, reverse=False, ties='average', start=1) -> list[float]:
    """Rank order a dataset. The lowest value has rank 1.

    Ties are averaged so that equal values receive the same rank:

        >>> data = [31, 56, 31, 25, 75, 18]
        >>> _rank(data)
        [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]

    The operation is idempotent:

        >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0])
        [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]

    It is possible to rank the data in reverse order so that the
    highest value has rank 1.  Also, a key-function can extract
    the field to be ranked:

        >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)]
        >>> _rank(goals, key=itemgetter(1), reverse=True)
        [2.0, 1.0, 3.0]

    Ranks are conventionally numbered starting from one; however,
    setting *start* to zero allows the ranks to be used as array indices:

        >>> prize = ['Gold', 'Silver', 'Bronze', 'Certificate']
        >>> scores = [8.1, 7.3, 9.4, 8.3]
        >>> [prize[int(i)] for i in _rank(scores, start=0, reverse=True)]
        ['Bronze', 'Certificate', 'Gold', 'Silver']

    """
    # If this function becomes public at some point, more thought
    # needs to be given to the signature.  A list of ints is
    # plausible when ties is "min" or "max".  When ties is "average",
    # either list[float] or list[Fraction] is plausible.

    # Default handling of ties matches scipy.stats.mstats.spearmanr.
    if ties != 'average':
        raise ValueError(f'Unknown tie resolution method: {ties!r}')
    if key is not None:
        data = map(key, data)
    val_pos = sorted(zip(data, count()), reverse=reverse)
    i = start - 1
    result = [0] * len(val_pos)
    for _, g in groupby(val_pos, key=itemgetter(0)):
        group = list(g)
        size = len(group)
        rank = i + (size + 1) / 2
        for value, orig_pos in group:
            result[orig_pos] = rank
        i += size
    return result


def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
    """Square root of n/m, rounded to the nearest integer using round-to-odd."""
    # Reference: https://www.lri.fr/~melquion/doc/05-imacs17_1-expose.pdf
    a = math.isqrt(n // m)
    return a | (a*a*m != n)


# For 53 bit precision floats, the bit width used in
# _float_sqrt_of_frac() is 109.
_sqrt_bit_width: int = 2 * sys.float_info.mant_dig + 3


def _float_sqrt_of_frac(n: int, m: int) -> float:
    """Square root of n/m as a float, correctly rounded."""
    # See principle and proof sketch at: https://bugs.python.org/msg407078
    q = (n.bit_length() - m.bit_length() - _sqrt_bit_width) // 2
    if q >= 0:
        numerator = _integer_sqrt_of_frac_rto(n, m << 2 * q) << q
        denominator = 1
    else:
        numerator = _integer_sqrt_of_frac_rto(n << -2 * q, m)
        denominator = 1 << -q
    return numerator / denominator   # Convert to float


def _decimal_sqrt_of_frac(n: int, m: int) -> Decimal:
    """Square root of n/m as a Decimal, correctly rounded."""
    # Premise:  For decimal, computing (n/m).sqrt() can be off
    #           by 1 ulp from the correctly rounded result.
    # Method:   Check the result, moving up or down a step if needed.
    if n <= 0:
        if not n:
            return Decimal('0.0')
        n, m = -n, -m

    root = (Decimal(n) / Decimal(m)).sqrt()
    nr, dr = root.as_integer_ratio()

    plus = root.next_plus()
    np, dp = plus.as_integer_ratio()
    # test: n / m > ((root + plus) / 2) ** 2
    if 4 * n * (dr*dp)**2 > m * (dr*np + dp*nr)**2:
        return plus

    minus = root.next_minus()
    nm, dm = minus.as_integer_ratio()
    # test: n / m < ((root + minus) / 2) ** 2
    if 4 * n * (dr*dm)**2 < m * (dr*nm + dm*nr)**2:
        return minus

    return root


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    T, total, n = _sum(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    return _convert(total / n, T)


def fmean(data, weights=None):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    if weights is None:
        try:
            n = len(data)
        except TypeError:
            # Handle iterators that do not define __len__().
            n = 0
            def count(iterable):
                nonlocal n
                for n, x in enumerate(iterable, start=1):
                    yield x
            data = count(data)
        total = fsum(data)
        if not n:
            raise StatisticsError('fmean requires at least one data point')
        return total / n
    if not isinstance(weights, (list, tuple)):
        weights = list(weights)
    try:
        num = sumprod(data, weights)
    except ValueError:
        raise StatisticsError('data and weights must be the same length')
    den = fsum(weights)
    if not den:
        raise StatisticsError('sum of weights must be non-zero')
    return num / den


def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty
    or if it contains a negative value.

    Returns zero if the product of inputs is zero.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    n = 0
    found_zero = False
    def count_positive(iterable):
        nonlocal n, found_zero
        for n, x in enumerate(iterable, start=1):
            if x > 0.0 or math.isnan(x):
                yield x
            elif x == 0.0:
                found_zero = True
            else:
                raise StatisticsError('No negative inputs allowed', x)
    total = fsum(map(log, count_positive(data)))
    if not n:
        raise StatisticsError('Must have a non-empty dataset')
    if math.isnan(total):
        return math.nan
    if found_zero:
        return math.nan if total == math.inf else 0.0
    return exp(total / n)


def harmonic_mean(data, weights=None):
    """Return the harmonic mean of data.

    The harmonic mean is the reciprocal of the arithmetic mean of the
    reciprocals of the data.  It can be used for averaging ratios or
    rates, for example speeds.

    Suppose a car travels 40 km/hr for 5 km and then speeds up to
    60 km/hr for another 5 km. What is the average speed?

        >>> harmonic_mean([40, 60])
        48.0

    Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
    speeds up to 60 km/hr for the remaining 30 km of the journey. What
    is the average speed?

        >>> harmonic_mean([40, 60], weights=[5, 30])
        56.0

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1 and weights is None:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    if weights is None:
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
    try:
        data = _fail_neg(data, errmsg)
        T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
    except ZeroDivisionError:
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)

# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        i = n // 2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        return data[n // 2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n // 2]


def median_grouped(data, interval=1.0):
    """Estimates the median for numeric data binned around the midpoints
    of consecutive, fixed-width intervals.

    The *data* can be any iterable of numeric data with each value being
    exactly the midpoint of a bin.  At least one value must be present.

    The *interval* is the width of each bin.

    For example, demographic information may have been summarized into
    consecutive ten-year age groups with each group being represented
    by the 5-year midpoints of the intervals:

        >>> demographics = Counter({
        ...    25: 172,   # 20 to 30 years old
        ...    35: 484,   # 30 to 40 years old
        ...    45: 387,   # 40 to 50 years old
        ...    55:  22,   # 50 to 60 years old
        ...    65:   6,   # 60 to 70 years old
        ... })

    The 50th percentile (median) is the 536th person out of the 1071
    member cohort.  That person is in the 30 to 40 year old age group.

    The regular median() function would assume that everyone in the
    tricenarian age group was exactly 35 years old.  A more tenable
    assumption is that the 484 members of that age group are evenly
    distributed between 30 and 40.  For that, we use median_grouped().

        >>> data = list(demographics.elements())
        >>> median(data)
        35
        >>> round(median_grouped(data, interval=10), 1)
        37.5

    The caller is responsible for making sure the data points are separated
    by exact multiples of *interval*.  This is essential for getting a
    correct result.  The function does not check this precondition.

    Inputs may be any numeric type that can be coerced to a float during
    the interpolation step.

    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")

    # Find the value at the midpoint. Remember this corresponds to the
    # midpoint of the class interval.
    x = data[n // 2]

    # Using O(log n) bisection, find where all the x values occur in the data.
    # All x will lie within data[i:j].
    i = bisect_left(data, x)
    j = bisect_right(data, x, lo=i)

    # Coerce to floats, raising a TypeError if not possible
    try:
        interval = float(interval)
        x = float(x)
    except ValueError:
        raise TypeError('Value cannot be converted to a float')

    # Interpolate the median using the formula found at:
    # https://www.cuemath.com/data/median-of-grouped-data/
    L = x - interval / 2.0    # Lower limit of the median interval
    cf = i                    # Cumulative frequency of the preceding interval
    f = j - i                 # Number of elements in the median interval
    return L + interval * (n / 2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3

    This also works with nominal (non-numeric) data:

        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'

    If there are multiple modes with the same frequency, return the first
    one encountered:

        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data))
    if not counts:
        return []
    maxcount = max(counts.values())
    return [value for value, count in counts.items() if count == maxcount]


def kde(data, h, kernel='normal', *, cumulative=False):
    """Kernel Density Estimation:  Create a continuous probability density
    function or cumulative distribution function from discrete samples.

    The basic idea is to smooth the data using a kernel function
    to help draw inferences about a population from a sample.

    The degree of smoothing is controlled by the scaling parameter h
    which is called the bandwidth.  Smaller values emphasize local
    features while larger values give smoother results.

    The kernel determines the relative weights of the sample data
    points.  Generally, the choice of kernel shape does not matter
    as much as the more influential bandwidth smoothing parameter.

    Kernels that give some weight to every sample point:

       normal (gauss)
       logistic
       sigmoid

    Kernels that only give weight to sample points within
    the bandwidth:

       rectangular (uniform)
       triangular
       parabolic (epanechnikov)
       quartic (biweight)
       triweight
       cosine

    If *cumulative* is true, will return a cumulative distribution function.

    A StatisticsError will be raised if the data sequence is empty.

    Example
    -------

    Given a sample of six data points, construct a continuous
    function that estimates the underlying probability density:

        >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
        >>> f_hat = kde(sample, h=1.5)

    Compute the area under the curve:

        >>> area = sum(f_hat(x) for x in range(-20, 20))
        >>> round(area, 4)
        1.0

    Plot the estimated probability density function at
    evenly spaced points from -6 to 10:

        >>> for x in range(-6, 11):
        ...     density = f_hat(x)
        ...     plot = ' ' * int(density * 400) + 'x'
        ...     print(f'{x:2}: {density:.3f} {plot}')
        ...
        -6: 0.002 x
        -5: 0.009    x
        -4: 0.031             x
        -3: 0.070                             x
        -2: 0.111                                             x
        -1: 0.125                                                   x
         0: 0.110                                            x
         1: 0.086                                   x
         2: 0.068                            x
         3: 0.059                        x
         4: 0.066                           x
         5: 0.082                                 x
         6: 0.082                                 x
         7: 0.058                        x
         8: 0.028            x
         9: 0.009    x
        10: 0.002 x

    Estimate P(4.5 < X <= 7.5), the probability that a new sample value
    will be between 4.5 and 7.5:

        >>> cdf = kde(sample, h=1.5, cumulative=True)
        >>> round(cdf(7.5) - cdf(4.5), 2)
        0.22

    References
    ----------

    Kernel density estimation and its application:
    https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf

    Kernel functions in common use:
    https://en.wikipedia.org/wiki/Kernel_(statistics)#kernel_functions_in_common_use

    Interactive graphical demonstration and exploration:
    https://demonstrations.wolfram.com/KernelDensityEstimation/

    Kernel estimation of cumulative distribution function of a random variable with bounded support
    https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf

    """

    n = len(data)
    if not n:
        raise StatisticsError('Empty data sequence')

    if not isinstance(data[0], (int, float)):
        raise TypeError('Data sequence must contain ints or floats')

    if h <= 0.0:
        raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')

    match kernel:

        case 'normal' | 'gauss':
            sqrt2pi = sqrt(2 * pi)
            sqrt2 = sqrt(2)
            K = lambda t: exp(-1/2 * t * t) / sqrt2pi
            W = lambda t: 1/2 * (1.0 + erf(t / sqrt2))
            support = None

        case 'logistic':
            # 1.0 / (exp(t) + 2.0 + exp(-t))
            K = lambda t: 1/2 / (1.0 + cosh(t))
            W = lambda t: 1.0 - 1.0 / (exp(t) + 1.0)
            support = None

        case 'sigmoid':
            # (2/pi) / (exp(t) + exp(-t))
            c1 = 1 / pi
            c2 = 2 / pi
            K = lambda t: c1 / cosh(t)
            W = lambda t: c2 * atan(exp(t))
            support = None

        case 'rectangular' | 'uniform':
            K = lambda t: 1/2
            W = lambda t: 1/2 * t + 1/2
            support = 1.0

        case 'triangular':
            K = lambda t: 1.0 - abs(t)
            W = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2
            support = 1.0

        case 'parabolic' | 'epanechnikov':
            K = lambda t: 3/4 * (1.0 - t * t)
            W = lambda t: -1/4 * t**3 + 3/4 * t + 1/2
            support = 1.0

        case 'quartic' | 'biweight':
            K = lambda t: 15/16 * (1.0 - t * t) ** 2
            W = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2
            support = 1.0

        case 'triweight':
            K = lambda t: 35/32 * (1.0 - t * t) ** 3
            W = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2
            support = 1.0

        case 'cosine':
            c1 = pi / 4
            c2 = pi / 2
            K = lambda t: c1 * cos(c2 * t)
            W = lambda t: 1/2 * sin(c2 * t) + 1/2
            support = 1.0

        case _:
            raise StatisticsError(f'Unknown kernel name: {kernel!r}')

    if support is None:

        def pdf(x):
            n = len(data)
            return sum(K((x - x_i) / h) for x_i in data) / (n * h)

        def cdf(x):
            n = len(data)
            return sum(W((x - x_i) / h) for x_i in data) / n

    else:

        sample = sorted(data)
        bandwidth = h * support

        def pdf(x):
            nonlocal n, sample
            if len(data) != n:
                sample = sorted(data)
                n = len(data)
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum(K((x - x_i) / h) for x_i in supported) / (n * h)

        def cdf(x):
            nonlocal n, sample
            if len(data) != n:
                sample = sorted(data)
                n = len(data)
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum((W((x - x_i) / h) for x_i in supported), i) / n

    if cumulative:
        cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'
        return cdf

    else:
        pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
        return pdf


# Notes on methods for computing quantiles
# ----------------------------------------
#
# There is no one perfect way to compute quantiles.  Here we offer
# two methods that serve common needs.  Most other packages
# surveyed offered at least one or both of these two, making them
# "standard" in the sense of "widely-adopted and reproducible".
# They are also easy to explain, easy to compute manually, and have
# straight-forward interpretations that aren't surprising.

# The default method is known as "R6", "PERCENTILE.EXC", or "expected
# value of rank order statistics". The alternative method is known as
# "R7", "PERCENTILE.INC", or "mode of rank order statistics".

# For sample data where there is a positive probability for values
# beyond the range of the data, the R6 exclusive method is a
# reasonable choice.  Consider a random sample of nine values from a
# population with a uniform distribution from 0.0 to 1.0.  The
# distribution of the third ranked sample point is described by
# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
# mean=0.300.  Only the latter (which corresponds with R6) gives the
# desired cut point with 30% of the population falling below that
# value, making it comparable to a result from an inv_cdf() function.
# The R6 exclusive method is also idempotent.

# For describing population data where the end points are known to
# be included in the data, the R7 inclusive method is a reasonable
# choice.  Instead of the mean, it uses the mode of the beta
# distribution for the interior points.  Per Hyndman & Fan, "One nice
# property is that the vertices of Q7(p) divide the range into n - 1
# intervals, and exactly 100p% of the intervals lie to the left of
# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."

# If needed, other methods could be added.  However, for now, the
# position is that fewer options make for easier choices and that
# external packages can be used for anything more advanced.

def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles, which gives the 99 cut points that
    separate *data* into 100 equal-sized groups.

    The *data* can be any iterable containing sample data.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
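
    A worked example (quartiles of ten equally spaced values; the inclusive
    method interpolates closer to the center):

    >>> data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> quantiles(data)
    [2.75, 5.5, 8.25]
    >>> quantiles(data, method='inclusive')
    [3.25, 5.5, 7.75]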
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        if ld == 1:
            return data * (n - 1)
        raise StatisticsError('must have at least one data point')

    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            j, delta = divmod(i * m, n)
            interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
            result.append(interpolated)
        return result

    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result

    raise ValueError(f'Unknown method: {method!r}')


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    return _convert(ss / (n - 1), T)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pstdev requires at least one data point')
    mss = ss / n
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def _mean_stdev(data):
    """In one pass, compute the mean and sample standard deviation as floats."""
    T, ss, xbar, n = _ss(data)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    try:
        return float(xbar), _float_sqrt_of_frac(mss.numerator, mss.denominator)
    except AttributeError:
        # Handle Nans and Infs gracefully
        return float(xbar), float(xbar) / float(ss)


def _sqrtprod(x: float, y: float) -> float:
    "Return sqrt(x * y) computed with improved accuracy and without overflow/underflow."
    h = sqrt(x * y)
    if not isfinite(h):
        if isinf(h) and not isinf(x) and not isinf(y):
            # Finite inputs overflowed, so scale down, and recompute.
            scale = 2.0 ** -512  # sqrt(1 / sys.float_info.max)
            return _sqrtprod(scale * x, scale * y) / scale
        return h
    if not h:
        if x and y:
            # Non-zero inputs underflowed, so scale up, and recompute.
            # Scale:  1 / sqrt(sys.float_info.min * sys.float_info.epsilon)
            scale = 2.0 ** 537
            return _sqrtprod(scale * x, scale * y) / scale
        return h
    # Improve accuracy with a differential correction.
    # https://www.wolframalpha.com/input/?i=Maclaurin+series+sqrt%28h**2+%2B+x%29+at+x%3D0
    d = sumprod((x, h), (y, -h))
    return h + d / (2.0 * h)


# === Statistics for relations between two inputs ===

# See https://en.wikipedia.org/wiki/Covariance
#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
#     https://en.wikipedia.org/wiki/Simple_linear_regression


def covariance(x, y, /):
    """Covariance

    Return the sample covariance of two inputs *x* and *y*. Covariance
    is a measure of the joint variability of two inputs.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> covariance(x, z)
    -7.5
    >>> covariance(z, x)
    -7.5

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('covariance requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('covariance requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y))
    return sxy / (n - 1)


def correlation(x, y, /, *, method='linear'):
    """Pearson's correlation coefficient

    Return the Pearson's correlation coefficient for two inputs. Pearson's
    correlation coefficient *r* takes values between -1 and +1. It measures
    the strength and direction of a linear relationship.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> correlation(x, x)
    1.0
    >>> correlation(x, y)
    -1.0

    If *method* is "ranked", computes Spearman's rank correlation coefficient
    for two inputs.  The data is replaced by ranks.  Ties are averaged
    so that equal values receive the same rank.  The resulting coefficient
    measures the strength of a monotonic relationship.

    Spearman's rank correlation coefficient is appropriate for ordinal
    data or for continuous data that doesn't meet the linear proportion
    requirement for Pearson's correlation coefficient.
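
    A worked example (the relation below is monotonic but not linear,
    so the rank correlation is exactly 1.0):

    >>> y = [2, 4, 8, 16, 32, 64, 128, 256, 512]
    >>> correlation(x, y, method='ranked')
    1.0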
    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('correlation requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('correlation requires at least two data points')
    if method not in {'linear', 'ranked'}:
        raise ValueError(f'Unknown method: {method!r}')
    if method == 'ranked':
        start = (n - 1) / -2            # Center rankings around zero
        x = _rank(x, start=start)
        y = _rank(y, start=start)
    else:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        x = [xi - xbar for xi in x]
        y = [yi - ybar for yi in y]
    sxy = sumprod(x, y)
    sxx = sumprod(x, x)
    syy = sumprod(y, y)
    try:
        return sxy / _sqrtprod(sxx, syy)
    except ZeroDivisionError:
        raise StatisticsError('at least one of the inputs is constant')


LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))


def linear_regression(x, y, /, *, proportional=False):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept of simple linear regression
    parameters estimated using ordinary least squares. Simple linear
    regression describes the relationship between an independent variable
    *x* and a dependent variable *y* in terms of a linear function:

        y = slope * x + intercept + noise

    where *slope* and *intercept* are the regression parameters that are
    estimated, and noise represents the variability of the data that was
    not explained by the linear regression (it is equal to the
    difference between predicted and actual values of the dependent
    variable).

    The parameters are returned as a named tuple.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.17495..., intercept=1.00925...)

    If *proportional* is true, the independent variable *x* and the
    dependent variable *y* are assumed to be directly proportional.
    The data is fit to a line passing through the origin.

    Since the *intercept* will always be 0.0, the underlying linear
    function simplifies to:

        y = slope * x + noise

    >>> y = [3 * x[i] + noise[i] for i in range(5)]
    >>> linear_regression(x, y, proportional=True)  #doctest: +ELLIPSIS
    LinearRegression(slope=2.90475..., intercept=0.0)

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('linear regression requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('linear regression requires at least two data points')
    if not proportional:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        x = [xi - xbar for xi in x]  # List because used three times below
        y = (yi - ybar for yi in y)  # Generator because only used once below
    sxy = sumprod(x, y) + 0.0        # Add zero to coerce result to a float
    sxx = sumprod(x, x)
    try:
        slope = sxy / sxx   # equivalent to:  covariance(x, y) / variance(x)
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    intercept = 0.0 if proportional else ybar - slope * xbar
    return LinearRegression(slope=slope, intercept=intercept)


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        x = -x
    return mu + (x * sigma)


# If available, use C implementation
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass


class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        return cls(*_mean_stdev(data))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        rnd = random.random if seed is None else random.Random(seed).random
        inv_cdf = _normal_dist_inv_cdf
        mu = self._mu
        sigma = self._sigma
        return [inv_cdf(rnd(), mu, sigma) for _ in repeat(None, n)]

    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
1526        variance = self._sigma * self._sigma
1527        if not variance:
1528            raise StatisticsError('pdf() not defined when sigma is zero')
1529        diff = x - self._mu
1530        return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)
1531
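    # Illustrative REPL session (added for exposition).  The standard
    # normal pdf peaks at the mean with height 1/sqrt(tau) ~= 0.3989:
    #
    #     >>> round(NormalDist().pdf(0.0), 4)
    #     0.3989
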
    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * _SQRT2)))

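    # Illustrative REPL session (added for exposition).  By symmetry,
    # half of the probability mass lies at or below the mean:
    #
    #     >>> NormalDist(mu=100.0, sigma=15.0).cdf(100.0)
    #     0.5
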
    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

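    # Illustrative REPL session (added for exposition).  inv_cdf() inverts
    # cdf(), so a round trip recovers the original value to within
    # floating-point error:
    #
    #     >>> X = NormalDist(mu=100.0, sigma=15.0)
    #     >>> round(X.inv_cdf(X.cdf(115.0)), 9)
    #     115.0
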
    def quantiles(self, n=4):
        """Divide into *n* continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set *n* to 4 for quartiles (the default).  Set *n* to 10 for
        deciles.  Set *n* to 100 for percentiles, which gives the 99 cut
        points that separate the normal distribution into 100 equal-sized
        groups.
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

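    # Illustrative REPL session (added for exposition).  The quartiles of
    # the standard normal distribution fall at roughly +/-0.6745:
    #
    #     >>> [round(q, 4) for q in NormalDist().quantiles()]
    #     [-0.6745, 0.0, 0.6745]
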
    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            return 1.0 - erf(dm / (2.0 * X._sigma * _SQRT2))
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    def zscore(self, x):
        """Compute the Standard Score.  (x - mean) / stdev

        Describes *x* in terms of the number of standard deviations
        above or below the mean of the normal distribution.
        """
        # https://www.statisticshowto.com/probability-and-statistics/z-score/
        if not self._sigma:
            raise StatisticsError('zscore() not defined when sigma is zero')
        return (x - self._mu) / self._sigma

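    # Illustrative REPL session (added for exposition).  A score of 115 on
    # a scale with mean 100 and standard deviation 15 is exactly one
    # standard deviation above the mean:
    #
    #     >>> NormalDist(mu=100.0, sigma=15.0).zscore(115)
    #     1.0
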
    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        return self._mu

    @property
    def median(self):
        "Return the median of the normal distribution."
        return self._mu

    @property
    def mode(self):
        """Return the mode of the normal distribution.

        The mode is the value x at which the probability density
        function (pdf) takes its maximum value.
        """
        return self._mu

    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma

    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma * self._sigma

    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu + x2, x1._sigma)

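    # Illustrative REPL session (added for exposition).  Variances add,
    # so the combined sigma is hypot(3, 4) == 5:
    #
    #     >>> NormalDist(0, 3) + NormalDist(0, 4)
    #     NormalDist(mu=0.0, sigma=5.0)
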
    def __sub__(x1, x2):
        """Subtract a constant or another NormalDist instance.

        If *other* is a constant, subtract the constant from mu,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu - x2, x1._sigma)

    def __mul__(x1, x2):
        """Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))

    def __truediv__(x1, x2):
        """Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))

    def __pos__(x1):
        "Return a copy of the instance."
        return NormalDist(x1._mu, x1._sigma)

    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)

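    # Illustrative REPL session (added for exposition).  Rescaling changes
    # measurement units; here Celsius temperatures become Fahrenheit:
    #
    #     >>> celsius = NormalDist(5, 2.5)
    #     >>> celsius * (9/5) + 32
    #     NormalDist(mu=41.0, sigma=4.5)
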
    __radd__ = __add__

    def __rsub__(x1, x2):
        "Subtract a NormalDist from a constant or another NormalDist."
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        "Two NormalDist objects are equal if their mu and sigma are both equal."
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return x1._mu == x2._mu and x1._sigma == x2._sigma

    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'

    def __getstate__(self):
        return self._mu, self._sigma

    def __setstate__(self, state):
        self._mu, self._sigma = state


## kde_random() ##############################################################

def _newton_raphson(f_inv_estimate, f, f_prime, tolerance=1e-12):
    # Numerically invert *f*: start from f_inv_estimate(y) and refine the
    # estimate with Newton's method until f(x) is within *tolerance* of y.
    def f_inv(y):
        "Return x such that f(x) ≈ y within the specified tolerance."
        x = f_inv_estimate(y)
        while abs(diff := f(x) - y) > tolerance:
            x -= diff / f_prime(x)
        return x
    return f_inv

def _quartic_invcdf_estimate(p):
    # Starting guess for Newton's method: a power-law fit plus a small
    # sinusoidal correction over the mid-range of probabilities.
    sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p)
    x = (2.0 * p) ** 0.4258865685331 - 1.0
    if 0.004 <= p < 0.499:    # correction is only needed mid-range
        x += 0.026818732 * sin(7.101753784 * p + 2.73230839482953)
    return x * sign

_quartic_invcdf = _newton_raphson(
    f_inv_estimate = _quartic_invcdf_estimate,
    f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2,
    f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2)

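# Illustrative sanity check (added for exposition).  The quartic kernel's
# CDF above is symmetric about zero, so the median maps back exactly:
#
#     >>> _quartic_invcdf(0.5)
#     0.0
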
def _triweight_invcdf_estimate(p):
    # Starting guess for Newton's method: a simple power-law fit.
    sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p)
    x = (2.0 * p) ** 0.3400218741872791 - 1.0
    return x * sign

_triweight_invcdf = _newton_raphson(
    f_inv_estimate = _triweight_invcdf_estimate,
    f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2,
    f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3)

_kernel_invcdfs = {
    'normal': NormalDist().inv_cdf,
    'logistic': lambda p: log(p / (1 - p)),
    'sigmoid': lambda p: log(tan(p * pi/2)),
    'rectangular': lambda p: 2*p - 1,
    'parabolic': lambda p: 2 * cos((acos(2*p-1) + pi) / 3),
    'quartic': _quartic_invcdf,
    'triweight': _triweight_invcdf,
    'triangular': lambda p: sqrt(2*p) - 1 if p < 1/2 else 1 - sqrt(2 - 2*p),
    'cosine': lambda p: 2 * asin(2*p - 1) / pi,
}
_kernel_invcdfs['gauss'] = _kernel_invcdfs['normal']
_kernel_invcdfs['uniform'] = _kernel_invcdfs['rectangular']
_kernel_invcdfs['epanechnikov'] = _kernel_invcdfs['parabolic']
_kernel_invcdfs['biweight'] = _kernel_invcdfs['quartic']

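# Illustrative sanity check (added for exposition).  Every kernel in the
# table is symmetric about zero, so each inverse CDF maps the median
# p=0.5 to (approximately) zero:
#
#     >>> all(abs(invcdf(0.5)) < 1e-9 for invcdf in _kernel_invcdfs.values())
#     True
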
def kde_random(data, h, kernel='normal', *, seed=None):
    """Return a function that makes a random selection from the estimated
    probability density function created by kde(data, h, kernel).

    Providing a *seed* allows reproducible selections within a single
    thread.  The seed may be an integer, float, str, or bytes.

    A StatisticsError will be raised if the *data* sequence is empty.

    Example:

    >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
    >>> rand = kde_random(data, h=1.5, seed=8675309)
    >>> new_selections = [rand() for i in range(10)]
    >>> [round(x, 1) for x in new_selections]
    [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6]

    """
    n = len(data)
    if not n:
        raise StatisticsError('Empty data sequence')

    if not isinstance(data[0], (int, float)):
        raise TypeError('Data sequence must contain ints or floats')

    if h <= 0.0:
        raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')

    kernel_invcdf = _kernel_invcdfs.get(kernel)
    if kernel_invcdf is None:
        raise StatisticsError(f'Unknown kernel name: {kernel!r}')

    prng = _random.Random(seed)
    random = prng.random
    choice = prng.choice

    def rand():
        return choice(data) + h * kernel_invcdf(random())

    rand.__doc__ = f'Random KDE selection with {h=!r} and {kernel=!r}'

    return rand

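# Illustrative REPL session (added for exposition; the kernel and seed are
# arbitrary choices).  Each selection picks a random data point and perturbs
# it by a kernel-shaped offset scaled by the bandwidth *h*; with a fixed
# seed, the stream of selections is repeatable:
#
#     >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
#     >>> rand1 = kde_random(data, h=1.5, kernel='triangular', seed=1234)
#     >>> rand2 = kde_random(data, h=1.5, kernel='triangular', seed=1234)
#     >>> rand1() == rand2()
#     True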