• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10==================  ==================================================
11Function            Description
12==================  ==================================================
13mean                Arithmetic mean (average) of data.
14fmean               Fast, floating point arithmetic mean.
15geometric_mean      Geometric mean of data.
16harmonic_mean       Harmonic mean of data.
17median              Median (middle value) of data.
18median_low          Low median of data.
19median_high         High median of data.
20median_grouped      Median, or 50th percentile, of grouped data.
21mode                Mode (most common value) of data.
22multimode           List of modes (most common values of data).
23quantiles           Divide data into intervals with equal probability.
24==================  ==================================================
25
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53==================  =============================================
54Function            Description
55==================  =============================================
56pvariance           Population variance of data.
57variance            Sample variance of data.
58pstdev              Population standard deviation of data.
59stdev               Sample standard deviation of data.
60==================  =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
76Exceptions
77----------
78
79A single exception is defined: StatisticsError is a subclass of ValueError.
80
81"""
82
# Public API: the names exported by ``from statistics import *``.
__all__ = [
    'NormalDist',
    'StatisticsError',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
102
103import math
104import numbers
105import random
106
107from fractions import Fraction
108from decimal import Decimal
109from itertools import groupby
110from bisect import bisect_left, bisect_right
111from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
112from operator import itemgetter
113from collections import Counter
114
115# === Exceptions ===
116
class StatisticsError(ValueError):
    """Exception raised for statistics-related errors (e.g. empty data
    where at least one point is required)."""
    pass
119
120
121# === Private utilities ===
122
def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    # Each value is split into an exact (numerator, denominator) pair and
    # accumulated in ``partials`` keyed by denominator, so no precision is
    # lost.  A key of None marks a non-finite value (NAN or INF) — see
    # _exact_ratio().
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get  # bound-method lookup hoisted out of the loop
    # T tracks the common result type of all values seen so far.
    T = _coerce(int, type(start))
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n,d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)
179
180
181def _isfinite(x):
182    try:
183        return x.is_finite()  # Likely a Decimal.
184    except AttributeError:
185        return math.isfinite(x)  # Coerces to float first.
186
187
188def _coerce(T, S):
189    """Coerce types T and S to a common type, or raise TypeError.
190
191    Coercion rules are currently an implementation detail. See the CoerceTest
192    test class in test_statistics for details.
193    """
194    # See http://bugs.python.org/issue24068.
195    assert T is not bool, "initial type T is bool"
196    # If the types are the same, no need to coerce anything. Put this
197    # first, so that the usual case (no coercion needed) happens as soon
198    # as possible.
199    if T is S:  return T
200    # Mixed int & other coerce to the other type.
201    if S is int or S is bool:  return T
202    if T is int:  return S
203    # If one is a (strict) subclass of the other, coerce to the subclass.
204    if issubclass(S, T):  return S
205    if issubclass(T, S):  return T
206    # Ints coerce to the other type.
207    if issubclass(T, int):  return S
208    if issubclass(S, int):  return T
209    # Mixed fraction & float coerces to float (or float subclass).
210    if issubclass(T, Fraction) and issubclass(S, float):
211        return S
212    if issubclass(T, float) and issubclass(S, Fraction):
213        return T
214    # Any other combination is disallowed.
215    msg = "don't know how to coerce %s and %s"
216    raise TypeError(msg % (T.__name__, S.__name__))
217
218
def _exact_ratio(x):
    """Return Real number x to exact (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.

    Non-finite values (NAN or INF) are returned as ``(x, None)``; the
    None denominator is the sentinel recognised by _sum().
    """
    try:
        # Optimise the common case of floats. We expect that the most often
        # used numeric type will be builtin floats, so try to make this as
        # fast as possible.
        if type(x) is float or type(x) is Decimal:
            return x.as_integer_ratio()
        try:
            # x may be an int, Fraction, or Integral ABC.
            return (x.numerator, x.denominator)
        except AttributeError:
            try:
                # x may be a float or Decimal subclass.
                return x.as_integer_ratio()
            except AttributeError:
                # Just give up?
                pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    msg = "can't convert type '{}' to numerator/denominator"
    raise TypeError(msg.format(type(x).__name__))
249
250
251def _convert(value, T):
252    """Convert value to given numeric type T."""
253    if type(value) is T:
254        # This covers the cases where T is Fraction, or where value is
255        # a NAN or INF (Decimal or float).
256        return value
257    if issubclass(T, int) and value.denominator != 1:
258        T = float
259    try:
260        # FIXME: what do we do if this overflows?
261        return T(value)
262    except TypeError:
263        if issubclass(T, Decimal):
264            return T(value.numerator)/T(value.denominator)
265        else:
266            raise
267
268
269def _find_lteq(a, x):
270    'Locate the leftmost value exactly equal to x'
271    i = bisect_left(a, x)
272    if i != len(a) and a[i] == x:
273        return i
274    raise ValueError
275
276
277def _find_rteq(a, l, x):
278    'Locate the rightmost value exactly equal to x'
279    i = bisect_right(a, x, lo=l)
280    if i != (len(a)+1) and a[i-1] == x:
281        return i-1
282    raise ValueError
283
284
285def _fail_neg(values, errmsg='negative value'):
286    """Iterate over values, failing if any are less than zero."""
287    for x in values:
288        if x < 0:
289            raise StatisticsError(errmsg)
290        yield x
291
292
293# === Measures of central tendency (averages) ===
294
def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    Fractions and Decimals are supported and their type is preserved:

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    Raises StatisticsError if ``data`` is empty.
    """
    # A one-shot iterator must be materialized so it can be both
    # measured with len() and summed.
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n == 0:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total / n, T)
319
320
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    Raises StatisticsError when the input dataset is empty.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        n = len(data)
    except TypeError:
        # One-shot iterator with no __len__(): count the items as
        # fsum() consumes them.
        n = 0
        def counted(iterable):
            nonlocal n
            for n, item in enumerate(iterable, start=1):
                yield item
        total = fsum(counted(data))
    else:
        total = fsum(data)
    if n == 0:
        raise StatisticsError('fmean requires at least one data point')
    return total / n
346
347
def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        # exp(mean(log(x))) is the geometric mean; log() raises
        # ValueError for zero or negative inputs, and fmean() raises
        # StatisticsError (a ValueError subclass) for empty input.
        return exp(fmean(map(log, data)))
    except ValueError:
        # Bug fix: the original split string concatenated to
        # "dataset  containing" with a doubled space.
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None
365
366
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data.
    It is often the appropriate average for rates and ratios, e.g.
    speeds, or P/E ratios of an equally-weighted portfolio:

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    (The arithmetic mean, about 5.167, would be too high here.)

    Raises StatisticsError if ``data`` is empty or any element is less
    than zero.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n == 0:
        raise StatisticsError('harmonic_mean requires at least one data point')
    if n == 1:
        # Single data point: return it directly after validation.
        x = data[0]
        if not isinstance(x, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if x < 0:
            raise StatisticsError(errmsg)
        return x
    try:
        T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        # Any zero in the data pulls the harmonic mean down to zero.
        return 0
    assert count == n
    return _convert(n / total, T)
411
412
413# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    For an odd number of data points the middle point is returned; for
    an even number the median is interpolated as the average of the two
    middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    Raises StatisticsError for empty data.
    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")
    mid = n // 2
    if n % 2:
        return data[mid]
    return (data[mid - 1] + data[mid]) / 2
436
437
def median_low(data):
    """Return the low median of numeric data.

    For an odd number of data points the middle value is returned; for
    an even number, the smaller of the two middle values.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    Raises StatisticsError for empty data.
    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")
    mid = n // 2
    return data[mid] if n % 2 else data[mid - 1]
458
459
def median_high(data):
    """Return the high median of data.

    For an odd number of data points the middle value is returned; for
    an even number, the larger of the two middle values.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    Raises StatisticsError for empty data.
    """
    ordered = sorted(data)
    if not ordered:
        raise StatisticsError("no median for empty data")
    return ordered[len(ordered) // 2]
477
478
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    # Reject strings/bytes explicitly: they would otherwise slip through
    # the duck-typed arithmetic below with confusing errors.
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    cf = l1  # cumulative frequency of values below the median interval
    f = l2 - l1 + 1  # frequency (count) of values inside the median interval
    # Standard grouped-median linear interpolation within the interval.
    return L + interval*(n/2 - cf)/f
532
533
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data and returns a single value — the
    standard schoolbook treatment of the mode:

        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3

    Nominal (non-numeric) data is also supported:

        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'

    When several values share the highest frequency, the first one
    encountered in the data wins:

        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'

    Raises StatisticsError if *data* is empty.
    """
    # Counter preserves first-seen order among equal counts, giving the
    # documented tie-breaking behaviour.
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None
563
564
def multimode(data):
    """Return a list of the most frequently occurring values.

    More than one result is returned when several values are tied for
    the highest count; an empty *data* gives an empty list.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data)).most_common()
    # most_common() sorts by descending count, so the first group of the
    # count-keyed groupby holds every value tied for the top count.
    _, top_group = next(groupby(counts, key=itemgetter(1)), (0, []))
    return [value for value, count in top_group]
581
582
583# Notes on methods for computing quantiles
584# ----------------------------------------
585#
586# There is no one perfect way to compute quantiles.  Here we offer
587# two methods that serve common needs.  Most other packages
588# surveyed offered at least one or both of these two, making them
589# "standard" in the sense of "widely-adopted and reproducible".
590# They are also easy to explain, easy to compute manually, and have
591# straight-forward interpretations that aren't surprising.
592
593# The default method is known as "R6", "PERCENTILE.EXC", or "expected
594# value of rank order statistics". The alternative method is known as
595# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
596
597# For sample data where there is a positive probability for values
598# beyond the range of the data, the R6 exclusive method is a
599# reasonable choice.  Consider a random sample of nine values from a
600# population with a uniform distribution from 0.0 to 100.0.  The
601# distribution of the third ranked sample point is described by
602# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
603# mean=0.300.  Only the latter (which corresponds with R6) gives the
604# desired cut point with 30% of the population falling below that
605# value, making it comparable to a result from an inv_cdf() function.
606# The R6 exclusive method is also idempotent.
607
608# For describing population data where the end points are known to
609# be included in the data, the R7 inclusive method is a reasonable
610# choice.  Instead of the mean, it uses the mode of the beta
611# distribution for the interior points.  Per Hyndman & Fan, "One nice
612# property is that the vertices of Q7(p) divide the range into n - 1
613# intervals, and exactly 100p% of the intervals lie to the left of
614# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
615
616# If needed, other methods could be added.  However, for now, the
617# position is that fewer options make for easier choices and that
618# external packages can be used for anything more advanced.
619
def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Use n=4 for quartiles (the default), n=10 for deciles, and n=100
    for percentiles (99 cut points splitting *data* into 100 groups).

    *data* can be any iterable of samples; the cut points are linearly
    interpolated between data points.

    With *method* set to *inclusive*, *data* is treated as population
    data: the minimum value is the 0th percentile and the maximum value
    the 100th.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    points = sorted(data)
    ld = len(points)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        m = ld - 1
        result = []
        for i in range(1, n):
            # Exact integer split of i*m/n into index and remainder.
            j, delta = divmod(i * m, n)
            cut = (points[j] * (n - delta) + points[j + 1] * delta) / n
            result.append(cut)
        return result
    if method == 'exclusive':
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n               # rescale i to m/n
            j = min(max(j, 1), ld - 1)   # clamp to 1 .. ld-1
            delta = i * m - j * n        # exact integer math
            cut = (points[j - 1] * (n - delta) + points[j] * delta) / n
            result.append(cut)
        return result
    raise ValueError(f'Unknown method: {method!r}')
662
663
664# === Measures of spread ===
665
666# See http://mathworld.wolfram.com/Variance.html
667#     http://mathworld.wolfram.com/SampleVariance.html
668#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
669#
670# Under no circumstances use the so-called "computational formula for
671# variance", as that is only suitable for hand calculations with a small
672# amount of low-precision data. It has terrible numeric properties.
673#
674# See a comparison of three computational methods here:
675# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
676
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.

    Returns a (type, sum-of-squares) pair in the style of _sum().
    """
    if c is None:
        c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    U, total2, count2 = _sum((x-c) for x in data)
    assert T == U and count == count2
    # Subtracting (sum of deviations)**2 / n compensates for any small
    # inaccuracy in ``c`` relative to the true mean.
    total -=  total2**2/len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)
695
696
def variance(data, xbar=None):
    """Return the sample variance of data.

    *data* should be an iterable of Real-valued numbers with at least
    two values.  The optional *xbar*, if given, should be the mean of
    the data; when omitted or None, the mean is computed automatically.

    Use this function when your data is a sample from a population; to
    calculate the variance of an entire population, see ``pvariance``.

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    Passing a precomputed mean avoids recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    No check is made that *xbar* actually equals mean(data); an
    arbitrary value can produce invalid or impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)
    """
    # Materialize one-shot iterators: the data is traversed twice.
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    # Sample variance divides by n-1 degrees of freedom (Bessel's correction).
    return _convert(ss / (n - 1), T)
742
743
def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    *data* should be a sequence or iterable of Real-valued numbers with
    at least one value.  The optional *mu*, if given, should be the mean
    of the data; when omitted or None, the mean is computed automatically.

    Use this function when *data* represents the entire population; to
    estimate the variance from a sample, ``variance`` is usually the
    better choice.

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    Passing a precomputed mean avoids recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)
    """
    # Materialize one-shot iterators: the data is traversed twice.
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    # Population variance divides by n (no Bessel's correction).
    return _convert(ss / n, T)
786
787
def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827
    """
    var = variance(data, xbar)
    # Decimal results provide their own sqrt(); floats and Fractions
    # fall back to math.sqrt (coercing to float).
    sqrt_method = getattr(var, 'sqrt', None)
    if sqrt_method is None:
        return math.sqrt(var)
    return sqrt_method()
802
803
def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251
    """
    var = pvariance(data, mu)
    # Decimal results provide their own sqrt(); floats and Fractions
    # fall back to math.sqrt (coercing to float).
    sqrt_method = getattr(var, 'sqrt', None)
    if sqrt_method is None:
        return math.sqrt(var)
    return sqrt_method()
818
819
820## Normal Distribution #####################################################
821
822
def _normal_dist_inv_cdf(p, mu, sigma):
    # Return the inverse CDF (quantile function) of N(mu, sigma) at p.
    #
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    #
    # The "Hash sum" comments below are checksums of the coefficient
    # tables from the paper — do not alter the constants.
    q = p - 0.5
    # Central region: |q| <= 0.425 (0.075 <= p <= 0.925).
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    # Tail regions: work with the smaller tail probability.
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    # Near tail: r <= 5.0.
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        # Far tail: r > 5.0.
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    # Mirror the result back to the lower tail when q was negative.
    if q < 0.0:
        x = -x
    return mu + (x * sigma)
895
896
class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    # Mapping form of __slots__ doubles as per-slot documentation.
    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        # Store as floats so arithmetic and comparisons are consistent
        # regardless of whether ints, Fractions, etc. were passed in.
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        if not isinstance(data, (list, tuple)):
            data = list(data)       # materialize so data can be iterated twice
        xbar = fmean(data)
        return cls(xbar, stdev(data, xbar))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        # A private Random instance is used when *seed* is given so the
        # stream is reproducible without disturbing the global generator.
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for _ in range(n)]

    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
        variance = self._sigma ** 2.0
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)

    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
        """
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            # Bug fix: this message previously named cdf() instead of
            # inv_cdf(), pointing users at the wrong method.
            raise StatisticsError('inv_cdf() not defined when sigma at or below zero')
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)

    def quantiles(self, n=4):
        """Divide into *n* continuous intervals with equal probability.

        Returns a list of (n - 1) cut points separating the intervals.

        Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
        Set *n* to 100 for percentiles which gives the 99 cuts points that
        separate the normal distribution in to 100 equal sized groups.
        """
        return [self.inv_cdf(i / n) for i in range(1, n)]

    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):   # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            # Equal variances: the PDFs cross exactly once, midway between means.
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        # Unequal variances: the PDFs cross at two points, x1 and x2.
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        return self._mu

    @property
    def median(self):
        "Return the median of the normal distribution"
        return self._mu

    @property
    def mode(self):
        """Return the mode of the normal distribution

        The mode is the value x where which the probability density
        function (pdf) takes its maximum value.
        """
        return self._mu

    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma

    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma ** 2.0

    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.

        If *other* is a constant, translate mu by the constant,
        leaving sigma unchanged.

        If *other* is a NormalDist, add both the means and the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            # hypot() adds in quadrature: sqrt(sigma1**2 + sigma2**2)
            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu + x2, x1._sigma)

    def __sub__(x1, x2):
        """Subtract a constant or another NormalDist instance.

        If *other* is a constant, translate by the constant mu,
        leaving sigma unchanged.

        If *other* is a NormalDist, subtract the means and add the variances.
        Mathematically, this works only if the two distributions are
        independent or if they are jointly normally distributed.
        """
        if isinstance(x2, NormalDist):
            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
        return NormalDist(x1._mu - x2, x1._sigma)

    def __mul__(x1, x2):
        """Multiply both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))

    def __truediv__(x1, x2):
        """Divide both mu and sigma by a constant.

        Used for rescaling, perhaps to change measurement units.
        Sigma is scaled with the absolute value of the constant.
        """
        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))

    def __pos__(x1):
        "Return a copy of the instance."
        return NormalDist(x1._mu, x1._sigma)

    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)

    __radd__ = __add__

    def __rsub__(x1, x2):
        "Subtract a NormalDist from a constant or another NormalDist."
        return -(x1 - x2)

    __rmul__ = __mul__

    def __eq__(x1, x2):
        "Two NormalDist objects are equal if their mu and sigma are both equal."
        if not isinstance(x2, NormalDist):
            return NotImplemented
        return x1._mu == x2._mu and x1._sigma == x2._sigma

    def __hash__(self):
        "NormalDist objects hash equal if their mu and sigma are both equal."
        return hash((self._mu, self._sigma))

    def __repr__(self):
        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'
1103
# If available, use C implementation
# The _statistics accelerator module provides a faster C version of
# _normal_dist_inv_cdf; importing it here rebinds the name, replacing the
# pure Python implementation defined above.  When the accelerator is not
# built for this platform, silently keep the Python fallback.
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass
1109
1110
if __name__ == '__main__':

    # Show math operations computed analytically in comparison
    # to a Monte Carlo simulation of the same operations

    from math import isclose
    from operator import add, sub, mul, truediv
    from itertools import repeat
    import doctest

    g1 = NormalDist(10, 20)
    g2 = NormalDist(-5, 25)

    # Test scaling by a constant
    assert (g1 * 5 / 5).mean == g1.mean
    assert (g1 * 5 / 5).stdev == g1.stdev

    n = 100_000
    G1 = g1.samples(n)
    G2 = g2.samples(n)

    for func in (add, sub):
        print(f'\nTest {func.__name__} with another NormalDist:')
        print(func(g1, g2))
        print(NormalDist.from_samples(map(func, G1, G2)))

    const = 11
    for func in (add, sub, mul, truediv):
        print(f'\nTest {func.__name__} with a constant:')
        print(func(g1, const))
        print(NormalDist.from_samples(map(func, G1, repeat(const))))

    const = 19
    for func in (add, sub, mul):
        print(f'\nTest constant with {func.__name__}:')
        print(func(const, g1))
        print(NormalDist.from_samples(map(func, repeat(const), G1)))

    def assert_close(G1, G2):
        # Bug fix: the mean assertion previously compared G1.mean with
        # itself, which always passed and made the check vacuous.
        assert isclose(G1.mean, G2.mean, rel_tol=0.01), (G1, G2)
        assert isclose(G1.stdev, G2.stdev, rel_tol=0.01), (G1, G2)

    X = NormalDist(-105, 73)
    Y = NormalDist(31, 47)
    s = 32.75
    n = 100_000

    S = NormalDist.from_samples([x + s for x in X.samples(n)])
    assert_close(X + s, S)

    S = NormalDist.from_samples([x - s for x in X.samples(n)])
    assert_close(X - s, S)

    S = NormalDist.from_samples([x * s for x in X.samples(n)])
    assert_close(X * s, S)

    S = NormalDist.from_samples([x / s for x in X.samples(n)])
    assert_close(X / s, S)

    S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X + Y, S)

    S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
                                                       Y.samples(n))])
    assert_close(X - Y, S)

    print(doctest.testmod())
1179