blob: f9d3802ec5f83e2d777fbbad2a2dd4ed28ccd35d [file] [log] [blame]
Olivier Deprezf4ef2d02021-04-20 13:36:24 +02001"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10================== ==================================================
11Function Description
12================== ==================================================
13mean Arithmetic mean (average) of data.
14fmean Fast, floating point arithmetic mean.
15geometric_mean Geometric mean of data.
16harmonic_mean Harmonic mean of data.
17median Median (middle value) of data.
18median_low Low median of data.
19median_high High median of data.
20median_grouped Median, or 50th percentile, of grouped data.
21mode Mode (most common value) of data.
22multimode List of modes (most common values of data).
23quantiles Divide data into intervals with equal probability.
24================== ==================================================
25
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53================== =============================================
54Function Description
55================== =============================================
56pvariance Population variance of data.
57variance Sample variance of data.
58pstdev Population standard deviation of data.
59stdev Sample standard deviation of data.
60================== =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
76Exceptions
77----------
78
79A single exception is defined: StatisticsError is a subclass of ValueError.
80
81"""
82
# Public API for ``from statistics import *`` (ASCII-sorted: classes first,
# then functions alphabetically).
__all__ = [
    'NormalDist',
    'StatisticsError',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
102
103import math
104import numbers
105import random
106
107from fractions import Fraction
108from decimal import Decimal
109from itertools import groupby
110from bisect import bisect_left, bisect_right
111from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
112from operator import itemgetter
113from collections import Counter
114
115# === Exceptions ===
116
class StatisticsError(ValueError):
    """Exception raised by the statistics functions for invalid data
    (e.g. an empty dataset where at least one value is required)."""
    pass
119
120
121# === Private utilities ===
122
def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    # ``partials`` maps each exact denominator to the running sum of the
    # numerators seen over that denominator.  The key None (produced by
    # _exact_ratio for a NAN or INF) marks a non-finite total.
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get  # bound-method lookup hoisted out of the loop
    # T tracks the common result type; int coerces to anything else.
    T = _coerce(int, type(start))
    # groupby lets us coerce the type once per run of identical types
    # instead of once per item.
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)
179
180
181def _isfinite(x):
182 try:
183 return x.is_finite() # Likely a Decimal.
184 except AttributeError:
185 return math.isfinite(x) # Coerces to float first.
186
187
188def _coerce(T, S):
189 """Coerce types T and S to a common type, or raise TypeError.
190
191 Coercion rules are currently an implementation detail. See the CoerceTest
192 test class in test_statistics for details.
193 """
194 # See http://bugs.python.org/issue24068.
195 assert T is not bool, "initial type T is bool"
196 # If the types are the same, no need to coerce anything. Put this
197 # first, so that the usual case (no coercion needed) happens as soon
198 # as possible.
199 if T is S: return T
200 # Mixed int & other coerce to the other type.
201 if S is int or S is bool: return T
202 if T is int: return S
203 # If one is a (strict) subclass of the other, coerce to the subclass.
204 if issubclass(S, T): return S
205 if issubclass(T, S): return T
206 # Ints coerce to the other type.
207 if issubclass(T, int): return S
208 if issubclass(S, int): return T
209 # Mixed fraction & float coerces to float (or float subclass).
210 if issubclass(T, Fraction) and issubclass(S, float):
211 return S
212 if issubclass(T, float) and issubclass(S, Fraction):
213 return T
214 # Any other combination is disallowed.
215 msg = "don't know how to coerce %s and %s"
216 raise TypeError(msg % (T.__name__, S.__name__))
217
218
219def _exact_ratio(x):
220 """Return Real number x to exact (numerator, denominator) pair.
221
222 >>> _exact_ratio(0.25)
223 (1, 4)
224
225 x is expected to be an int, Fraction, Decimal or float.
226 """
227 try:
228 # Optimise the common case of floats. We expect that the most often
229 # used numeric type will be builtin floats, so try to make this as
230 # fast as possible.
231 if type(x) is float or type(x) is Decimal:
232 return x.as_integer_ratio()
233 try:
234 # x may be an int, Fraction, or Integral ABC.
235 return (x.numerator, x.denominator)
236 except AttributeError:
237 try:
238 # x may be a float or Decimal subclass.
239 return x.as_integer_ratio()
240 except AttributeError:
241 # Just give up?
242 pass
243 except (OverflowError, ValueError):
244 # float NAN or INF.
245 assert not _isfinite(x)
246 return (x, None)
247 msg = "can't convert type '{}' to numerator/denominator"
248 raise TypeError(msg.format(type(x).__name__))
249
250
251def _convert(value, T):
252 """Convert value to given numeric type T."""
253 if type(value) is T:
254 # This covers the cases where T is Fraction, or where value is
255 # a NAN or INF (Decimal or float).
256 return value
257 if issubclass(T, int) and value.denominator != 1:
258 T = float
259 try:
260 # FIXME: what do we do if this overflows?
261 return T(value)
262 except TypeError:
263 if issubclass(T, Decimal):
264 return T(value.numerator) / T(value.denominator)
265 else:
266 raise
267
268
269def _find_lteq(a, x):
270 'Locate the leftmost value exactly equal to x'
271 i = bisect_left(a, x)
272 if i != len(a) and a[i] == x:
273 return i
274 raise ValueError
275
276
277def _find_rteq(a, l, x):
278 'Locate the rightmost value exactly equal to x'
279 i = bisect_right(a, x, lo=l)
280 if i != (len(a) + 1) and a[i - 1] == x:
281 return i - 1
282 raise ValueError
283
284
285def _fail_neg(values, errmsg='negative value'):
286 """Iterate over values, failing if any are less than zero."""
287 for x in values:
288 if x < 0:
289 raise StatisticsError(errmsg)
290 yield x
291
292
293# === Measures of central tendency (averages) ===
294
def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    Raises StatisticsError when ``data`` is empty.
    """
    # A one-shot iterator must be materialized so it can be both
    # counted and summed.
    if iter(data) is data:
        data = list(data)
    size = len(data)
    if size == 0:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == size
    # Convert the exact fractional total back to the data's own type.
    return _convert(total / size, T)
319
320
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        n = len(data)
    except TypeError:
        # Iterator input with no __len__(): count the items as fsum()
        # consumes them.
        n = 0
        def counted(iterable):
            nonlocal n
            for n, value in enumerate(iterable, start=1):
                yield value
        total = fsum(counted(data))
    else:
        total = fsum(data)
    if n:
        return total / n
    raise StatisticsError('fmean requires at least one data point')
346
347
def geometric_mean(data):
    """Return the geometric mean of data, computed using floats.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    # log() raises ValueError for zero/negative inputs, and fmean() raises
    # StatisticsError (a ValueError subclass) for an empty dataset, so a
    # single except clause covers both failure modes.
    try:
        log_average = fmean(map(log, data))
    except ValueError:
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              ' containing positive numbers') from None
    return exp(log_average)
365
366
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data.
    It is often the appropriate average for quantities which are rates
    or ratios, such as speeds.  Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    Raises ``StatisticsError`` when ``data`` is empty or contains an
    element less than zero.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    if n == 1:
        # Single data point: no averaging needed, but still validate it.
        x = data[0]
        if not isinstance(x, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if x < 0:
            raise StatisticsError(errmsg)
        return x
    try:
        T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        # Any zero in the data pins the harmonic mean at zero.
        return 0
    assert count == n
    return _convert(n / total, T)
411
412
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    For an odd number of data points the middle value is returned.  For an
    even count, the median is interpolated by averaging the two middle
    values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    values = sorted(data)
    n = len(values)
    if n == 0:
        raise StatisticsError("no median for empty data")
    mid = n // 2
    if n % 2:
        return values[mid]
    return (values[mid - 1] + values[mid]) / 2
436
437
def median_low(data):
    """Return the low median of numeric data.

    For an odd number of data points this is the middle value; for an
    even count it is the smaller of the two middle values.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    values = sorted(data)
    n = len(values)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2:
        return values[n // 2]
    return values[n // 2 - 1]
458
459
def median_high(data):
    """Return the high median of data.

    For an odd number of data points this is the middle value; for an
    even count it is the larger of the two middle values.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    values = sorted(data)
    if not values:
        raise StatisticsError("no median for empty data")
    return values[len(values) // 2]
477
478
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n // 2]
    # Strings and bytes support comparison and would otherwise fail later
    # with a confusing error, so reject them explicitly up front.
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval / 2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval) / 2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    # cf: cumulative frequency of the classes below the median class
    # (the number of data points preceding the first occurrence of x);
    # f: frequency within the median class (number of occurrences of x).
    cf = l1
    f = l2 - l1 + 1
    # Standard grouped-median interpolation within the class [L, L+interval).
    return L + interval * (n / 2 - cf) / f
532
533
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    Nominal (non-numeric) data is supported as well:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    When several values share the highest frequency, the first one
    encountered in the data is returned:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    counts = Counter(iter(data))
    if not counts:
        raise StatisticsError('no mode for empty data')
    # most_common() breaks frequency ties in first-encountered order.
    return counts.most_common(1)[0][0]
562
563
def multimode(data):
    """Return a list of the most frequently occurring values.

    Returns more than one result when there are multiple modes,
    or an empty list when *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data)).most_common()
    if not counts:
        return []
    # most_common() sorts by descending frequency, so the first entry
    # carries the maximal count; keep every value that matches it.
    top_frequency = counts[0][1]
    return [value for value, frequency in counts if frequency == top_frequency]
580
581
582# Notes on methods for computing quantiles
583# ----------------------------------------
584#
585# There is no one perfect way to compute quantiles. Here we offer
586# two methods that serve common needs. Most other packages
587# surveyed offered at least one or both of these two, making them
588# "standard" in the sense of "widely-adopted and reproducible".
589# They are also easy to explain, easy to compute manually, and have
590# straight-forward interpretations that aren't surprising.
591
592# The default method is known as "R6", "PERCENTILE.EXC", or "expected
593# value of rank order statistics". The alternative method is known as
594# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
595
596# For sample data where there is a positive probability for values
597# beyond the range of the data, the R6 exclusive method is a
598# reasonable choice. Consider a random sample of nine values from a
599# population with a uniform distribution from 0.0 to 1.0. The
600# distribution of the third ranked sample point is described by
601# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
602# mean=0.300. Only the latter (which corresponds with R6) gives the
603# desired cut point with 30% of the population falling below that
604# value, making it comparable to a result from an inv_cdf() function.
605# The R6 exclusive method is also idempotent.
606
607# For describing population data where the end points are known to
608# be included in the data, the R7 inclusive method is a reasonable
609# choice. Instead of the mean, it uses the mode of the beta
610# distribution for the interior points. Per Hyndman & Fan, "One nice
611# property is that the vertices of Q7(p) divide the range into n - 1
612# intervals, and exactly 100p% of the intervals lie to the left of
613# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
614
615# If needed, other methods could be added. However, for now, the
616# position is that fewer options make for easier choices and that
617# external packages can be used for anything more advanced.
618
def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default), to 10 for deciles, or to
    100 for percentiles (which gives the 99 cut points that separate
    *data* into 100 equal sized groups).

    The *data* can be any iterable containing samples.  The cut points
    are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is then treated as the 0th percentile and
    the maximum value as the 100th percentile.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        # R7 / PERCENTILE.INC: endpoints are the 0th and 100th percentiles.
        m = ld - 1
        points = []
        for i in range(1, n):
            j, delta = divmod(i * m, n)
            points.append((data[j] * (n - delta) + data[j + 1] * delta) / n)
        return points
    if method == 'exclusive':
        # R6 / PERCENTILE.EXC: allows for values beyond the observed range.
        m = ld + 1
        points = []
        for i in range(1, n):
            j = min(max(i * m // n, 1), ld - 1)  # rescale i, clamp to 1 .. ld-1
            delta = i * m - j * n                # exact integer math
            points.append((data[j - 1] * (n - delta) + data[j] * delta) / n)
        return points
    raise ValueError(f'Unknown method: {method!r}')
660
661
662# === Measures of spread ===
663
664# See http://mathworld.wolfram.com/Variance.html
665# http://mathworld.wolfram.com/SampleVariance.html
666# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
667#
668# Under no circumstances use the so-called "computational formula for
669# variance", as that is only suitable for hand calculations with a small
670# amount of low-precision data. It has terrible numeric properties.
671#
672# See a comparison of three computational methods here:
673# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
674
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.

    Returns a ``(type, sum_of_squares)`` pair in the same style as _sum().
    """
    if c is not None:
        # Caller supplied the centre; trust it and do a single pass.
        T, total, count = _sum((x-c)**2 for x in data)
        return (T, total)
    c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.  Subtracting total2**2/n compensates for the error
    # introduced by using a computed (inexact) mean as the centre.
    U, total2, count2 = _sum((x - c) for x in data)
    assert T == U and count == count2
    total -= total2 ** 2 / len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)
695
696
def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    # Materialize one-shot iterators so the data can be traversed twice.
    if iter(data) is data:
        data = list(data)
    count = len(data)
    if count < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    # Divide by n - 1 (Bessel's correction) for an unbiased sample estimate.
    return _convert(ss / (count - 1), T)
742
743
def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    # Materialize one-shot iterators so the data can be traversed twice.
    if iter(data) is data:
        data = list(data)
    count = len(data)
    if count < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    # The population variance divides by n (no Bessel's correction).
    return _convert(ss / count, T)
786
787
def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    # Decimal results carry their own sqrt(); everything else goes
    # through math.sqrt() (which returns a float).
    decimal_sqrt = getattr(var, 'sqrt', None)
    if decimal_sqrt is not None:
        return decimal_sqrt()
    return math.sqrt(var)
802
803
def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    # Decimal results carry their own sqrt(); everything else goes
    # through math.sqrt() (which returns a float).
    decimal_sqrt = getattr(var, 'sqrt', None)
    if decimal_sqrt is not None:
        return decimal_sqrt()
    return math.sqrt(var)
818
819
820## Normal Distribution #####################################################
821
822
def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    #
    # q is the (signed) distance of p from the distribution midpoint.
    q = p - 0.5
    if fabs(q) <= 0.425:
        # Central region: rational polynomial approximation in
        # r = 0.180625 - q*q.
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    # Tail regions: work with the smaller tail probability, transformed
    # through r = sqrt(-log(r)).
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        # Near tail.
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        # Far tail.
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    # Restore the sign for the lower tail.
    if q < 0.0:
        x = -x
    return mu + (x * sigma)
895
896
# Prefer the accelerated C implementation from the private _statistics
# module when it is available; otherwise silently keep the pure-Python
# _normal_dist_inv_cdf defined above.
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass
902
903
904class NormalDist:
905 "Normal distribution of a random variable"
906 # https://en.wikipedia.org/wiki/Normal_distribution
907 # https://en.wikipedia.org/wiki/Variance#Properties
908
909 __slots__ = {
910 '_mu': 'Arithmetic mean of a normal distribution',
911 '_sigma': 'Standard deviation of a normal distribution',
912 }
913
    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        # Only a negative spread is rejected; sigma == 0.0 is accepted here
        # (pdf()/cdf() raise later if such a distribution is queried).
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)
920
921 @classmethod
922 def from_samples(cls, data):
923 "Make a normal distribution instance from sample data."
924 if not isinstance(data, (list, tuple)):
925 data = list(data)
926 xbar = fmean(data)
927 return cls(xbar, stdev(data, xbar))
928
929 def samples(self, n, *, seed=None):
930 "Generate *n* samples for a given mean and standard deviation."
931 gauss = random.gauss if seed is None else random.Random(seed).gauss
932 mu, sigma = self._mu, self._sigma
933 return [gauss(mu, sigma) for i in range(n)]
934
935 def pdf(self, x):
936 "Probability density function. P(x <= X < x+dx) / dx"
937 variance = self._sigma ** 2.0
938 if not variance:
939 raise StatisticsError('pdf() not defined when sigma is zero')
940 return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
941
942 def cdf(self, x):
943 "Cumulative distribution function. P(X <= x)"
944 if not self._sigma:
945 raise StatisticsError('cdf() not defined when sigma is zero')
946 return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
947
    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
        """
        # Note: a NaN *p* passes both guards and is handed to the
        # approximation below; do not rewrite these comparisons.
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            raise StatisticsError('cdf() not defined when sigma at or below zero')
        # Delegates to the module-level rational approximation (possibly
        # the C implementation imported from _statistics).
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)
963
964 def quantiles(self, n=4):
965 """Divide into *n* continuous intervals with equal probability.
966
967 Returns a list of (n - 1) cut points separating the intervals.
968
969 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
970 Set *n* to 100 for percentiles which gives the 99 cuts points that
971 separate the normal distribution in to 100 equal sized groups.
972 """
973 return [self.inv_cdf(i / n) for i in range(1, n)]
974
    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        >>> N1 = NormalDist(2.4, 1.6)
        >>> N2 = NormalDist(3.2, 2.0)
        >>> N1.overlap(N2)
        0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            # Equal variances: the two densities cross at a single point.
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        # Unequal variances: x1 and x2 are the two crossing points of the
        # density curves (per Inman & Bradley, cited above).
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1008
1009 def zscore(self, x):
1010 """Compute the Standard Score. (x - mean) / stdev
1011
1012 Describes *x* in terms of the number of standard deviations
1013 above or below the mean of the normal distribution.
1014 """
1015 # https://www.statisticshowto.com/probability-and-statistics/z-score/
1016 if not self._sigma:
1017 raise StatisticsError('zscore() not defined when sigma is zero')
1018 return (x - self._mu) / self._sigma
1019
1020 @property
1021 def mean(self):
1022 "Arithmetic mean of the normal distribution."
1023 return self._mu
1024
1025 @property
1026 def median(self):
1027 "Return the median of the normal distribution"
1028 return self._mu
1029
1030 @property
1031 def mode(self):
1032 """Return the mode of the normal distribution
1033
1034 The mode is the value x where which the probability density
1035 function (pdf) takes its maximum value.
1036 """
1037 return self._mu
1038
1039 @property
1040 def stdev(self):
1041 "Standard deviation of the normal distribution."
1042 return self._sigma
1043
1044 @property
1045 def variance(self):
1046 "Square of the standard deviation."
1047 return self._sigma ** 2.0
1048
1049 def __add__(x1, x2):
1050 """Add a constant or another NormalDist instance.
1051
1052 If *other* is a constant, translate mu by the constant,
1053 leaving sigma unchanged.
1054
1055 If *other* is a NormalDist, add both the means and the variances.
1056 Mathematically, this works only if the two distributions are
1057 independent or if they are jointly normally distributed.
1058 """
1059 if isinstance(x2, NormalDist):
1060 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
1061 return NormalDist(x1._mu + x2, x1._sigma)
1062
1063 def __sub__(x1, x2):
1064 """Subtract a constant or another NormalDist instance.
1065
1066 If *other* is a constant, translate by the constant mu,
1067 leaving sigma unchanged.
1068
1069 If *other* is a NormalDist, subtract the means and add the variances.
1070 Mathematically, this works only if the two distributions are
1071 independent or if they are jointly normally distributed.
1072 """
1073 if isinstance(x2, NormalDist):
1074 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
1075 return NormalDist(x1._mu - x2, x1._sigma)
1076
1077 def __mul__(x1, x2):
1078 """Multiply both mu and sigma by a constant.
1079
1080 Used for rescaling, perhaps to change measurement units.
1081 Sigma is scaled with the absolute value of the constant.
1082 """
1083 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
1084
1085 def __truediv__(x1, x2):
1086 """Divide both mu and sigma by a constant.
1087
1088 Used for rescaling, perhaps to change measurement units.
1089 Sigma is scaled with the absolute value of the constant.
1090 """
1091 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
1092
1093 def __pos__(x1):
1094 "Return a copy of the instance."
1095 return NormalDist(x1._mu, x1._sigma)
1096
1097 def __neg__(x1):
1098 "Negates mu while keeping sigma the same."
1099 return NormalDist(-x1._mu, x1._sigma)
1100
    # Addition is commutative here, so reflected addition reuses __add__.
    __radd__ = __add__
1102
1103 def __rsub__(x1, x2):
1104 "Subtract a NormalDist from a constant or another NormalDist."
1105 return -(x1 - x2)
1106
    # Scaling by a constant is commutative, so reflected multiply reuses __mul__.
    __rmul__ = __mul__
1108
1109 def __eq__(x1, x2):
1110 "Two NormalDist objects are equal if their mu and sigma are both equal."
1111 if not isinstance(x2, NormalDist):
1112 return NotImplemented
1113 return x1._mu == x2._mu and x1._sigma == x2._sigma
1114
1115 def __hash__(self):
1116 "NormalDist objects hash equal if their mu and sigma are both equal."
1117 return hash((self._mu, self._sigma))
1118
1119 def __repr__(self):
1120 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'