• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10==================  ==================================================
11Function            Description
12==================  ==================================================
13mean                Arithmetic mean (average) of data.
14fmean               Fast, floating point arithmetic mean.
15geometric_mean      Geometric mean of data.
16harmonic_mean       Harmonic mean of data.
17median              Median (middle value) of data.
18median_low          Low median of data.
19median_high         High median of data.
20median_grouped      Median, or 50th percentile, of grouped data.
21mode                Mode (most common value) of data.
22multimode           List of modes (most common values of data).
23quantiles           Divide data into intervals with equal probability.
24==================  ==================================================
25
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53==================  =============================================
54Function            Description
55==================  =============================================
56pvariance           Population variance of data.
57variance            Sample variance of data.
58pstdev              Population standard deviation of data.
59stdev               Sample standard deviation of data.
60==================  =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
76Exceptions
77----------
78
79A single exception is defined: StatisticsError is a subclass of ValueError.
80
81"""
82
# Names exported by ``from statistics import *`` — the module's public API.
__all__ = [
    'NormalDist',
    'StatisticsError',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
102
103import math
104import numbers
105import random
106
107from fractions import Fraction
108from decimal import Decimal
109from itertools import groupby
110from bisect import bisect_left, bisect_right
111from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
112from operator import itemgetter
113from collections import Counter
114
115# === Exceptions ===
116
class StatisticsError(ValueError):
    # Subclasses ValueError so that existing ``except ValueError`` handlers
    # continue to catch errors raised by this module.
    pass
119
120
121# === Private utilities ===
122
def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    n, d = _exact_ratio(start)
    # Bucket exact numerator sums by denominator: partials[d] is the integer
    # sum of every numerator seen with denominator d, so all arithmetic here
    # is exact.  A None key (from _exact_ratio) marks a NAN/INF value.
    partials = {d: n}
    partials_get = partials.get  # bind once; called for every data point
    # T is the running common type all inputs coerce to.
    T = _coerce(int, type(start))
    # groupby batches runs of same-typed values so _coerce runs once per run
    # rather than once per element.
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)
179
180
181def _isfinite(x):
182    try:
183        return x.is_finite()  # Likely a Decimal.
184    except AttributeError:
185        return math.isfinite(x)  # Coerces to float first.
186
187
188def _coerce(T, S):
189    """Coerce types T and S to a common type, or raise TypeError.
190
191    Coercion rules are currently an implementation detail. See the CoerceTest
192    test class in test_statistics for details.
193    """
194    # See http://bugs.python.org/issue24068.
195    assert T is not bool, "initial type T is bool"
196    # If the types are the same, no need to coerce anything. Put this
197    # first, so that the usual case (no coercion needed) happens as soon
198    # as possible.
199    if T is S:  return T
200    # Mixed int & other coerce to the other type.
201    if S is int or S is bool:  return T
202    if T is int:  return S
203    # If one is a (strict) subclass of the other, coerce to the subclass.
204    if issubclass(S, T):  return S
205    if issubclass(T, S):  return T
206    # Ints coerce to the other type.
207    if issubclass(T, int):  return S
208    if issubclass(S, int):  return T
209    # Mixed fraction & float coerces to float (or float subclass).
210    if issubclass(T, Fraction) and issubclass(S, float):
211        return S
212    if issubclass(T, float) and issubclass(S, Fraction):
213        return T
214    # Any other combination is disallowed.
215    msg = "don't know how to coerce %s and %s"
216    raise TypeError(msg % (T.__name__, S.__name__))
217
218
def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF: as_integer_ratio() raises OverflowError for an
        # infinity and ValueError for a NAN.  Report the value itself with a
        # None denominator so _sum() can recognise the non-finite case.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))
249
250
251def _convert(value, T):
252    """Convert value to given numeric type T."""
253    if type(value) is T:
254        # This covers the cases where T is Fraction, or where value is
255        # a NAN or INF (Decimal or float).
256        return value
257    if issubclass(T, int) and value.denominator != 1:
258        T = float
259    try:
260        # FIXME: what do we do if this overflows?
261        return T(value)
262    except TypeError:
263        if issubclass(T, Decimal):
264            return T(value.numerator) / T(value.denominator)
265        else:
266            raise
267
268
269def _find_lteq(a, x):
270    'Locate the leftmost value exactly equal to x'
271    i = bisect_left(a, x)
272    if i != len(a) and a[i] == x:
273        return i
274    raise ValueError
275
276
277def _find_rteq(a, l, x):
278    'Locate the rightmost value exactly equal to x'
279    i = bisect_right(a, x, lo=l)
280    if i != (len(a) + 1) and a[i - 1] == x:
281        return i - 1
282    raise ValueError
283
284
285def _fail_neg(values, errmsg='negative value'):
286    """Iterate over values, failing if any are less than zero."""
287    for x in values:
288        if x < 0:
289            raise StatisticsError(errmsg)
290        yield x
291
292
293# === Measures of central tendency (averages) ===
294
def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    # A one-shot iterator must be materialised so it can be both counted
    # and summed.
    if iter(data) is data:
        data = list(data)
    size = len(data)
    if size == 0:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == size
    return _convert(total / size, T)
319
320
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        size = len(data)
    except TypeError:
        # A one-shot iterator has no len(); count the elements as fsum()
        # consumes them so the data is traversed only once.
        size = 0
        def tally(iterable):
            nonlocal size
            for size, value in enumerate(iterable, start=1):
                yield value
        total = fsum(tally(data))
    else:
        total = fsum(data)
    try:
        return total / size
    except ZeroDivisionError:
        raise StatisticsError('fmean requires at least one data point') from None
346
347
def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
        # log() raises ValueError for zero or negative inputs, and fmean()
        # raises StatisticsError (a ValueError subclass) for empty input.
        # Bug fix: the previous message concatenated to "dataset  containing"
        # with a doubled space.
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None
365
366
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    size = len(data)
    if size < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    if size == 1:
        # The harmonic mean of a single value is the value itself, but the
        # type and sign checks still apply.
        value = data[0]
        if not isinstance(value, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if value < 0:
            raise StatisticsError(errmsg)
        return value
    try:
        T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        # A zero anywhere makes the mean of reciprocals infinite, so the
        # harmonic mean collapses to zero.
        return 0
    assert count == size
    return _convert(size / total, T)
411
412
413# FIXME: investigate ways to calculate medians without sorting? Quickselect?
414def median(data):
415    """Return the median (middle value) of numeric data.
416
417    When the number of data points is odd, return the middle data point.
418    When the number of data points is even, the median is interpolated by
419    taking the average of the two middle values:
420
421    >>> median([1, 3, 5])
422    3
423    >>> median([1, 3, 5, 7])
424    4.0
425
426    """
427    data = sorted(data)
428    n = len(data)
429    if n == 0:
430        raise StatisticsError("no median for empty data")
431    if n % 2 == 1:
432        return data[n // 2]
433    else:
434        i = n // 2
435        return (data[i - 1] + data[i]) / 2
436
437
def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    size = len(data)
    if size == 0:
        raise StatisticsError("no median for empty data")
    mid = size // 2
    # For an even count, step one back to the lower of the two middle values.
    return data[mid] if size % 2 else data[mid - 1]
458
459
def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    size = len(data)
    if size == 0:
        raise StatisticsError("no median for empty data")
    # Index size // 2 is the middle element for odd counts and the upper of
    # the two middle elements for even counts.
    return data[size // 2]
477
478
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n // 2]
    # Reject str/bytes up front with a clear TypeError instead of letting
    # them fail obscurely in the arithmetic below.
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval / 2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval) / 2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    # cf: cumulative frequency below the median class; f: frequency within it.
    cf = l1
    f = l2 - l1 + 1
    # Standard grouped-median interpolation: L + interval * (n/2 - cf) / f.
    return L + interval * (n / 2 - cf) / f
532
533
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3

    This also works with nominal (non-numeric) data:

        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'

    If there are multiple modes with same frequency, return the first one
    encountered:

        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    counts = Counter(iter(data))
    if not counts:
        raise StatisticsError('no mode for empty data')
    # most_common(1) breaks ties by first encounter, since Counter preserves
    # insertion order.
    return counts.most_common(1)[0][0]
562
563
def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data))
    if not counts:
        return []
    top = max(counts.values())
    # Counter preserves first-encounter order, so tied modes come out in the
    # order they first appeared in the data.
    return [value for value, freq in counts.items() if freq == top]
580
581
582# Notes on methods for computing quantiles
583# ----------------------------------------
584#
585# There is no one perfect way to compute quantiles.  Here we offer
586# two methods that serve common needs.  Most other packages
587# surveyed offered at least one or both of these two, making them
588# "standard" in the sense of "widely-adopted and reproducible".
589# They are also easy to explain, easy to compute manually, and have
590# straight-forward interpretations that aren't surprising.
591
592# The default method is known as "R6", "PERCENTILE.EXC", or "expected
593# value of rank order statistics". The alternative method is known as
594# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
595
596# For sample data where there is a positive probability for values
597# beyond the range of the data, the R6 exclusive method is a
598# reasonable choice.  Consider a random sample of nine values from a
599# population with a uniform distribution from 0.0 to 1.0.  The
600# distribution of the third ranked sample point is described by
601# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
602# mean=0.300.  Only the latter (which corresponds with R6) gives the
603# desired cut point with 30% of the population falling below that
604# value, making it comparable to a result from an inv_cdf() function.
605# The R6 exclusive method is also idempotent.
606
607# For describing population data where the end points are known to
608# be included in the data, the R7 inclusive method is a reasonable
609# choice.  Instead of the mean, it uses the mode of the beta
610# distribution for the interior points.  Per Hyndman & Fan, "One nice
611# property is that the vertices of Q7(p) divide the range into n - 1
612# intervals, and exactly 100p% of the intervals lie to the left of
613# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
614
615# If needed, other methods could be added.  However, for now, the
616# position is that fewer options make for easier choices and that
617# external packages can be used for anything more advanced.
618
def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cuts points that
    separate *data* in to 100 equal sized groups.

    The *data* can be any iterable containing sample.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        # R7 method: the data min/max are the 0th/100th percentiles.
        m = ld - 1
        return [(data[j] * (n - delta) + data[j + 1] * delta) / n
                for j, delta in (divmod(i * m, n) for i in range(1, n))]
    if method == 'exclusive':
        # R6 method: allows for sample values beyond the observed range.
        m = ld + 1
        cuts = []
        for i in range(1, n):
            j = i * m // n                 # rescale i to m/n
            j = min(max(j, 1), ld - 1)     # clamp to 1 .. ld-1
            delta = i * m - j * n          # exact integer math
            cuts.append((data[j - 1] * (n - delta) + data[j] * delta) / n)
        return cuts
    raise ValueError(f'Unknown method: {method!r}')
660
661
662# === Measures of spread ===
663
664# See http://mathworld.wolfram.com/Variance.html
665#     http://mathworld.wolfram.com/SampleVariance.html
666#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
667#
668# Under no circumstances use the so-called "computational formula for
669# variance", as that is only suitable for hand calculations with a small
670# amount of low-precision data. It has terrible numeric properties.
671#
672# See a comparison of three computational methods here:
673# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
674
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
    """
    if c is not None:
        T, total, count = _sum((x-c)**2 for x in data)
        return (T, total)
    c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x - c) for x in data)
    assert T == U and count == count2
    # Compensated correction: subtracting (sum of deviations)**2 / n cancels
    # the error introduced by an inexactly-represented mean.
    total -= total2 ** 2 / len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)
695
696
def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    # Materialise one-shot iterators so the data can be traversed twice.
    if iter(data) is data:
        data = list(data)
    size = len(data)
    if size < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    # Bessel's correction: divide by n - 1 for a sample estimate.
    return _convert(ss / (size - 1), T)
742
743
def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    # Materialise one-shot iterators so the data can be traversed twice.
    if iter(data) is data:
        data = list(data)
    size = len(data)
    if size < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    # Population variance divides by n (no Bessel correction).
    return _convert(ss / size, T)
786
787
def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    # Exact types such as Decimal supply their own sqrt(); everything else
    # goes through math.sqrt, which returns a float.
    own_sqrt = getattr(var, 'sqrt', None)
    if own_sqrt is not None:
        return own_sqrt()
    return math.sqrt(var)
802
803
def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    # Exact types such as Decimal supply their own sqrt(); everything else
    # goes through math.sqrt, which returns a float.
    own_sqrt = getattr(var, 'sqrt', None)
    if own_sqrt is not None:
        return own_sqrt()
    return math.sqrt(var)
818
819
820## Normal Distribution #####################################################
821
822
def _normal_dist_inv_cdf(p, mu, sigma):
    """Inverse CDF (quantile function) for a normal distribution.

    Maps a probability *p* in (0, 1) to the x value at which the CDF of the
    normal distribution with mean *mu* and standard deviation *sigma*
    equals *p*.
    """
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5
    # Central region: |q| <= 0.425 uses a rational polynomial in r directly.
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    # Tail regions: work with r = sqrt(-log(min(p, 1-p))) and pick one of
    # two further rational approximations depending on how far out we are.
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    # The approximation was computed for the upper tail; mirror for q < 0.
    if q < 0.0:
        x = -x
    return mu + (x * sigma)
895
896
897# If available, use C implementation
898try:
899    from _statistics import _normal_dist_inv_cdf
900except ImportError:
901    pass
902
903
class NormalDist:
    "Normal distribution of a random variable"
    # Models N(mu, sigma) and supports arithmetic between independent
    # normally distributed random variables and constants.
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    # Dict-valued __slots__ doubles as per-slot documentation.
    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        if not isinstance(data, (list, tuple)):
            # fmean() and stdev() each iterate the data, so a one-shot
            # iterator must be materialized first.
            data = list(data)
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        # A seeded private Random instance gives reproducible output
        # without disturbing the shared module-level generator.
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for _ in range(n)]

    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
        variance = self._sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        # tau is 2*pi, so sqrt(tau*variance) is the usual normalizing term.
        return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)

    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.

        Raises StatisticsError unless 0.0 < p < 1.0 and sigma > 0.0.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            # Bug fix: the message previously blamed cdf() for a failure
            # raised by inv_cdf(), misleading debugging of tracebacks.
            raise StatisticsError('inv_cdf() not defined when sigma at or below zero')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def quantiles(self, n=4):
        """Divide into *n* continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
        Set *n* to 100 for percentiles which gives the 99 cut points that
        separate the normal distribution into 100 equal sized groups.
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            # Equal variances: the densities cross at a single point midway
            # between the means, so the overlap reduces to one erf() term.
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        # Unequal variances: the densities cross at two points x1 and x2,
        # the roots of the quadratic obtained by equating the two pdfs.
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    def zscore(self, x):
        """Compute the Standard Score.  (x - mean) / stdev

        Describes *x* in terms of the number of standard deviations
        above or below the mean of the normal distribution.
        """
        # https://www.statisticshowto.com/probability-and-statistics/z-score/
        if not self._sigma:
            raise StatisticsError('zscore() not defined when sigma is zero')
        return (x - self._mu) / self._sigma

    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        return self._mu

    @property
    def median(self):
        "Return the median of the normal distribution"
        return self._mu

    @property
    def mode(self):
        """Return the mode of the normal distribution

        The mode is the value x where which the probability density
        function (pdf) takes its maximum value.
        """
        return self._mu

    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma

    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma ** 2.0

    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            # hypot() computes sqrt(s1**2 + s2**2): variances add, so the
            # new sigma is the Euclidean norm of the two sigmas.
            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu + x2, x1._sigma)

    def __sub__(x1, x2):
        """Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate by the constant mu,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            # Variances add even under subtraction.
            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu - x2, x1._sigma)

    def __mul__(x1, x2):
        """Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))

    def __truediv__(x1, x2):
        """Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))

    def __pos__(x1):
        "Return a copy of the instance."
        return NormalDist(x1._mu, x1._sigma)

    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        "Subtract a NormalDist from a constant or another NormalDist."
        # x2 - x1 == -(x1 - x2); __sub__ handles both operand kinds.
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        "Two NormalDist objects are equal if their mu and sigma are both equal."
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return x1._mu == x2._mu and x1._sigma == x2._sigma

    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'
1121