• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10==================  =============================================
11Function            Description
12==================  =============================================
13mean                Arithmetic mean (average) of data.
14harmonic_mean       Harmonic mean of data.
15median              Median (middle value) of data.
16median_low          Low median of data.
17median_high         High median of data.
18median_grouped      Median, or 50th percentile, of grouped data.
19mode                Mode (most common value) of data.
20==================  =============================================
21
22Calculate the arithmetic mean ("the average") of data:
23
24>>> mean([-1.0, 2.5, 3.25, 5.75])
252.625
26
27
28Calculate the standard median of discrete data:
29
30>>> median([2, 3, 4, 5])
313.5
32
33
34Calculate the median, or 50th percentile, of data grouped into class intervals
35centred on the data values provided. E.g. if your data points are rounded to
36the nearest whole number:
37
38>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
392.8333333333...
40
41This should be interpreted in this way: you have two data points in the class
42interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
43the class interval 3.5-4.5. The median of these data points is 2.8333...
44
45
46Calculating variability or spread
47---------------------------------
48
49==================  =============================================
50Function            Description
51==================  =============================================
52pvariance           Population variance of data.
53variance            Sample variance of data.
54pstdev              Population standard deviation of data.
55stdev               Sample standard deviation of data.
56==================  =============================================
57
58Calculate the standard deviation of sample data:
59
60>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
614.38961843444...
62
63If you have previously calculated the mean, you can pass it as the optional
64second argument to the four "spread" functions to avoid recalculating it:
65
66>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
67>>> mu = mean(data)
68>>> pvariance(data, mu)
692.5
70
71
72Exceptions
73----------
74
75A single exception is defined: StatisticsError is a subclass of ValueError.
76
77"""
78
79__all__ = [ 'StatisticsError',
80            'pstdev', 'pvariance', 'stdev', 'variance',
81            'median',  'median_low', 'median_high', 'median_grouped',
82            'mean', 'mode', 'harmonic_mean',
83          ]
84
85import collections
86import decimal
87import math
88import numbers
89
90from fractions import Fraction
91from decimal import Decimal
92from itertools import groupby, chain
93from bisect import bisect_left, bisect_right
94
95
96
97# === Exceptions ===
98
99class StatisticsError(ValueError):
100    pass
101
102
103# === Private utilities ===
104
105def _sum(data, start=0):
106    """_sum(data [, start]) -> (type, sum, count)
107
108    Return a high-precision sum of the given numeric data as a fraction,
109    together with the type to be converted to and the count of items.
110
111    If optional argument ``start`` is given, it is added to the total.
112    If ``data`` is empty, ``start`` (defaulting to 0) is returned.
113
114
115    Examples
116    --------
117
118    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
119    (<class 'float'>, Fraction(11, 1), 5)
120
121    Some sources of round-off error will be avoided:
122
123    # Built-in sum returns zero.
124    >>> _sum([1e50, 1, -1e50] * 1000)
125    (<class 'float'>, Fraction(1000, 1), 3000)
126
127    Fractions and Decimals are also supported:
128
129    >>> from fractions import Fraction as F
130    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
131    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
132
133    >>> from decimal import Decimal as D
134    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
135    >>> _sum(data)
136    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
137
138    Mixed types are currently treated as an error, except that int is
139    allowed.
140    """
141    count = 0
142    n, d = _exact_ratio(start)
143    partials = {d: n}
144    partials_get = partials.get
145    T = _coerce(int, type(start))
146    for typ, values in groupby(data, type):
147        T = _coerce(T, typ)  # or raise TypeError
148        for n,d in map(_exact_ratio, values):
149            count += 1
150            partials[d] = partials_get(d, 0) + n
151    if None in partials:
152        # The sum will be a NAN or INF. We can ignore all the finite
153        # partials, and just look at this special one.
154        total = partials[None]
155        assert not _isfinite(total)
156    else:
157        # Sum all the partial sums using builtin sum.
158        # FIXME is this faster if we sum them in order of the denominator?
159        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
160    return (T, total, count)
161
162
163def _isfinite(x):
164    try:
165        return x.is_finite()  # Likely a Decimal.
166    except AttributeError:
167        return math.isfinite(x)  # Coerces to float first.
168
169
170def _coerce(T, S):
171    """Coerce types T and S to a common type, or raise TypeError.
172
173    Coercion rules are currently an implementation detail. See the CoerceTest
174    test class in test_statistics for details.
175    """
176    # See http://bugs.python.org/issue24068.
177    assert T is not bool, "initial type T is bool"
178    # If the types are the same, no need to coerce anything. Put this
179    # first, so that the usual case (no coercion needed) happens as soon
180    # as possible.
181    if T is S:  return T
182    # Mixed int & other coerce to the other type.
183    if S is int or S is bool:  return T
184    if T is int:  return S
185    # If one is a (strict) subclass of the other, coerce to the subclass.
186    if issubclass(S, T):  return S
187    if issubclass(T, S):  return T
188    # Ints coerce to the other type.
189    if issubclass(T, int):  return S
190    if issubclass(S, int):  return T
191    # Mixed fraction & float coerces to float (or float subclass).
192    if issubclass(T, Fraction) and issubclass(S, float):
193        return S
194    if issubclass(T, float) and issubclass(S, Fraction):
195        return T
196    # Any other combination is disallowed.
197    msg = "don't know how to coerce %s and %s"
198    raise TypeError(msg % (T.__name__, S.__name__))
199
200
201def _exact_ratio(x):
202    """Return Real number x to exact (numerator, denominator) pair.
203
204    >>> _exact_ratio(0.25)
205    (1, 4)
206
207    x is expected to be an int, Fraction, Decimal or float.
208    """
209    try:
210        # Optimise the common case of floats. We expect that the most often
211        # used numeric type will be builtin floats, so try to make this as
212        # fast as possible.
213        if type(x) is float or type(x) is Decimal:
214            return x.as_integer_ratio()
215        try:
216            # x may be an int, Fraction, or Integral ABC.
217            return (x.numerator, x.denominator)
218        except AttributeError:
219            try:
220                # x may be a float or Decimal subclass.
221                return x.as_integer_ratio()
222            except AttributeError:
223                # Just give up?
224                pass
225    except (OverflowError, ValueError):
226        # float NAN or INF.
227        assert not _isfinite(x)
228        return (x, None)
229    msg = "can't convert type '{}' to numerator/denominator"
230    raise TypeError(msg.format(type(x).__name__))
231
232
233def _convert(value, T):
234    """Convert value to given numeric type T."""
235    if type(value) is T:
236        # This covers the cases where T is Fraction, or where value is
237        # a NAN or INF (Decimal or float).
238        return value
239    if issubclass(T, int) and value.denominator != 1:
240        T = float
241    try:
242        # FIXME: what do we do if this overflows?
243        return T(value)
244    except TypeError:
245        if issubclass(T, Decimal):
246            return T(value.numerator)/T(value.denominator)
247        else:
248            raise
249
250
251def _counts(data):
252    # Generate a table of sorted (value, frequency) pairs.
253    table = collections.Counter(iter(data)).most_common()
254    if not table:
255        return table
256    # Extract the values with the highest frequency.
257    maxfreq = table[0][1]
258    for i in range(1, len(table)):
259        if table[i][1] != maxfreq:
260            table = table[:i]
261            break
262    return table
263
264
265def _find_lteq(a, x):
266    'Locate the leftmost value exactly equal to x'
267    i = bisect_left(a, x)
268    if i != len(a) and a[i] == x:
269        return i
270    raise ValueError
271
272
273def _find_rteq(a, l, x):
274    'Locate the rightmost value exactly equal to x'
275    i = bisect_right(a, x, lo=l)
276    if i != (len(a)+1) and a[i-1] == x:
277        return i-1
278    raise ValueError
279
280
281def _fail_neg(values, errmsg='negative value'):
282    """Iterate over values, failing if any are less than zero."""
283    for x in values:
284        if x < 0:
285            raise StatisticsError(errmsg)
286        yield x
287
288
289# === Measures of central tendency (averages) ===
290
291def mean(data):
292    """Return the sample arithmetic mean of data.
293
294    >>> mean([1, 2, 3, 4, 4])
295    2.8
296
297    >>> from fractions import Fraction as F
298    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
299    Fraction(13, 21)
300
301    >>> from decimal import Decimal as D
302    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
303    Decimal('0.5625')
304
305    If ``data`` is empty, StatisticsError will be raised.
306    """
307    if iter(data) is data:
308        data = list(data)
309    n = len(data)
310    if n < 1:
311        raise StatisticsError('mean requires at least one data point')
312    T, total, count = _sum(data)
313    assert count == n
314    return _convert(total/n, T)
315
316
317def harmonic_mean(data):
318    """Return the harmonic mean of data.
319
320    The harmonic mean, sometimes called the subcontrary mean, is the
321    reciprocal of the arithmetic mean of the reciprocals of the data,
322    and is often appropriate when averaging quantities which are rates
323    or ratios, for example speeds. Example:
324
325    Suppose an investor purchases an equal value of shares in each of
326    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
327    What is the average P/E ratio for the investor's portfolio?
328
329    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
330    3.6
331
332    Using the arithmetic mean would give an average of about 5.167, which
333    is too high.
334
335    If ``data`` is empty, or any element is less than zero,
336    ``harmonic_mean`` will raise ``StatisticsError``.
337    """
338    # For a justification for using harmonic mean for P/E ratios, see
339    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
340    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
341    if iter(data) is data:
342        data = list(data)
343    errmsg = 'harmonic mean does not support negative values'
344    n = len(data)
345    if n < 1:
346        raise StatisticsError('harmonic_mean requires at least one data point')
347    elif n == 1:
348        x = data[0]
349        if isinstance(x, (numbers.Real, Decimal)):
350            if x < 0:
351                raise StatisticsError(errmsg)
352            return x
353        else:
354            raise TypeError('unsupported type')
355    try:
356        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
357    except ZeroDivisionError:
358        return 0
359    assert count == n
360    return _convert(n/total, T)
361
362
363# FIXME: investigate ways to calculate medians without sorting? Quickselect?
364def median(data):
365    """Return the median (middle value) of numeric data.
366
367    When the number of data points is odd, return the middle data point.
368    When the number of data points is even, the median is interpolated by
369    taking the average of the two middle values:
370
371    >>> median([1, 3, 5])
372    3
373    >>> median([1, 3, 5, 7])
374    4.0
375
376    """
377    data = sorted(data)
378    n = len(data)
379    if n == 0:
380        raise StatisticsError("no median for empty data")
381    if n%2 == 1:
382        return data[n//2]
383    else:
384        i = n//2
385        return (data[i - 1] + data[i])/2
386
387
388def median_low(data):
389    """Return the low median of numeric data.
390
391    When the number of data points is odd, the middle value is returned.
392    When it is even, the smaller of the two middle values is returned.
393
394    >>> median_low([1, 3, 5])
395    3
396    >>> median_low([1, 3, 5, 7])
397    3
398
399    """
400    data = sorted(data)
401    n = len(data)
402    if n == 0:
403        raise StatisticsError("no median for empty data")
404    if n%2 == 1:
405        return data[n//2]
406    else:
407        return data[n//2 - 1]
408
409
410def median_high(data):
411    """Return the high median of data.
412
413    When the number of data points is odd, the middle value is returned.
414    When it is even, the larger of the two middle values is returned.
415
416    >>> median_high([1, 3, 5])
417    3
418    >>> median_high([1, 3, 5, 7])
419    5
420
421    """
422    data = sorted(data)
423    n = len(data)
424    if n == 0:
425        raise StatisticsError("no median for empty data")
426    return data[n//2]
427
428
429def median_grouped(data, interval=1):
430    """Return the 50th percentile (median) of grouped continuous data.
431
432    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
433    3.7
434    >>> median_grouped([52, 52, 53, 54])
435    52.5
436
437    This calculates the median as the 50th percentile, and should be
438    used when your data is continuous and grouped. In the above example,
439    the values 1, 2, 3, etc. actually represent the midpoint of classes
440    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
441    class 3.5-4.5, and interpolation is used to estimate it.
442
443    Optional argument ``interval`` represents the class interval, and
444    defaults to 1. Changing the class interval naturally will change the
445    interpolated 50th percentile value:
446
447    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
448    3.25
449    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
450    3.5
451
452    This function does not check whether the data points are at least
453    ``interval`` apart.
454    """
455    data = sorted(data)
456    n = len(data)
457    if n == 0:
458        raise StatisticsError("no median for empty data")
459    elif n == 1:
460        return data[0]
461    # Find the value at the midpoint. Remember this corresponds to the
462    # centre of the class interval.
463    x = data[n//2]
464    for obj in (x, interval):
465        if isinstance(obj, (str, bytes)):
466            raise TypeError('expected number but got %r' % obj)
467    try:
468        L = x - interval/2  # The lower limit of the median interval.
469    except TypeError:
470        # Mixed type. For now we just coerce to float.
471        L = float(x) - float(interval)/2
472
473    # Uses bisection search to search for x in data with log(n) time complexity
474    # Find the position of leftmost occurrence of x in data
475    l1 = _find_lteq(data, x)
476    # Find the position of rightmost occurrence of x in data[l1...len(data)]
477    # Assuming always l1 <= l2
478    l2 = _find_rteq(data, l1, x)
479    cf = l1
480    f = l2 - l1 + 1
481    return L + interval*(n/2 - cf)/f
482
483
484def mode(data):
485    """Return the most common data point from discrete or nominal data.
486
487    ``mode`` assumes discrete data, and returns a single value. This is the
488    standard treatment of the mode as commonly taught in schools:
489
490    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
491    3
492
493    This also works with nominal (non-numeric) data:
494
495    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
496    'red'
497
498    If there is not exactly one most common value, ``mode`` will raise
499    StatisticsError.
500    """
501    # Generate a table of sorted (value, frequency) pairs.
502    table = _counts(data)
503    if len(table) == 1:
504        return table[0][0]
505    elif table:
506        raise StatisticsError(
507                'no unique mode; found %d equally common values' % len(table)
508                )
509    else:
510        raise StatisticsError('no mode for empty data')
511
512
513# === Measures of spread ===
514
515# See http://mathworld.wolfram.com/Variance.html
516#     http://mathworld.wolfram.com/SampleVariance.html
517#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
518#
519# Under no circumstances use the so-called "computational formula for
520# variance", as that is only suitable for hand calculations with a small
521# amount of low-precision data. It has terrible numeric properties.
522#
523# See a comparison of three computational methods here:
524# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
525
526def _ss(data, c=None):
527    """Return sum of square deviations of sequence data.
528
529    If ``c`` is None, the mean is calculated in one pass, and the deviations
530    from the mean are calculated in a second pass. Otherwise, deviations are
531    calculated from ``c`` as given. Use the second case with care, as it can
532    lead to garbage results.
533    """
534    if c is None:
535        c = mean(data)
536    T, total, count = _sum((x-c)**2 for x in data)
537    # The following sum should mathematically equal zero, but due to rounding
538    # error may not.
539    U, total2, count2 = _sum((x-c) for x in data)
540    assert T == U and count == count2
541    total -=  total2**2/len(data)
542    assert not total < 0, 'negative sum of square deviations: %f' % total
543    return (T, total)
544
545
546def variance(data, xbar=None):
547    """Return the sample variance of data.
548
549    data should be an iterable of Real-valued numbers, with at least two
550    values. The optional argument xbar, if given, should be the mean of
551    the data. If it is missing or None, the mean is automatically calculated.
552
553    Use this function when your data is a sample from a population. To
554    calculate the variance from the entire population, see ``pvariance``.
555
556    Examples:
557
558    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
559    >>> variance(data)
560    1.3720238095238095
561
562    If you have already calculated the mean of your data, you can pass it as
563    the optional second argument ``xbar`` to avoid recalculating it:
564
565    >>> m = mean(data)
566    >>> variance(data, m)
567    1.3720238095238095
568
569    This function does not check that ``xbar`` is actually the mean of
570    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
571    impossible results.
572
573    Decimals and Fractions are supported:
574
575    >>> from decimal import Decimal as D
576    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
577    Decimal('31.01875')
578
579    >>> from fractions import Fraction as F
580    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
581    Fraction(67, 108)
582
583    """
584    if iter(data) is data:
585        data = list(data)
586    n = len(data)
587    if n < 2:
588        raise StatisticsError('variance requires at least two data points')
589    T, ss = _ss(data, xbar)
590    return _convert(ss/(n-1), T)
591
592
593def pvariance(data, mu=None):
594    """Return the population variance of ``data``.
595
596    data should be an iterable of Real-valued numbers, with at least one
597    value. The optional argument mu, if given, should be the mean of
598    the data. If it is missing or None, the mean is automatically calculated.
599
600    Use this function to calculate the variance from the entire population.
601    To estimate the variance from a sample, the ``variance`` function is
602    usually a better choice.
603
604    Examples:
605
606    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
607    >>> pvariance(data)
608    1.25
609
610    If you have already calculated the mean of the data, you can pass it as
611    the optional second argument to avoid recalculating it:
612
613    >>> mu = mean(data)
614    >>> pvariance(data, mu)
615    1.25
616
617    This function does not check that ``mu`` is actually the mean of ``data``.
618    Giving arbitrary values for ``mu`` may lead to invalid or impossible
619    results.
620
621    Decimals and Fractions are supported:
622
623    >>> from decimal import Decimal as D
624    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
625    Decimal('24.815')
626
627    >>> from fractions import Fraction as F
628    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
629    Fraction(13, 72)
630
631    """
632    if iter(data) is data:
633        data = list(data)
634    n = len(data)
635    if n < 1:
636        raise StatisticsError('pvariance requires at least one data point')
637    T, ss = _ss(data, mu)
638    return _convert(ss/n, T)
639
640
641def stdev(data, xbar=None):
642    """Return the square root of the sample variance.
643
644    See ``variance`` for arguments and other details.
645
646    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
647    1.0810874155219827
648
649    """
650    var = variance(data, xbar)
651    try:
652        return var.sqrt()
653    except AttributeError:
654        return math.sqrt(var)
655
656
657def pstdev(data, mu=None):
658    """Return the square root of the population variance.
659
660    See ``pvariance`` for arguments and other details.
661
662    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
663    0.986893273527251
664
665    """
666    var = pvariance(data, mu)
667    try:
668        return var.sqrt()
669    except AttributeError:
670        return math.sqrt(var)
671