blob: f9d3802ec5f83e2d777fbbad2a2dd4ed28ccd35d [file] [log] [blame]
Olivier Deprezf4ef2d02021-04-20 13:36:24 +02001"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10================== ==================================================
11Function Description
12================== ==================================================
13mean Arithmetic mean (average) of data.
14fmean Fast, floating point arithmetic mean.
15geometric_mean Geometric mean of data.
16harmonic_mean Harmonic mean of data.
17median Median (middle value) of data.
18median_low Low median of data.
19median_high High median of data.
20median_grouped Median, or 50th percentile, of grouped data.
21mode Mode (most common value) of data.
22multimode List of modes (most common values of data).
23quantiles Divide data into intervals with equal probability.
24================== ==================================================
25
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53================== =============================================
54Function Description
55================== =============================================
56pvariance Population variance of data.
57variance Sample variance of data.
58pstdev Population standard deviation of data.
59stdev Sample standard deviation of data.
60================== =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
76Exceptions
77----------
78
79A single exception is defined: StatisticsError is a subclass of ValueError.
80
81"""
82
# Public API for ``from statistics import *`` (ASCII-sorted: classes first,
# then functions alphabetically).
__all__ = [
    'NormalDist',
    'StatisticsError',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
102
103import math
104import numbers
105import random
106
107from fractions import Fraction
108from decimal import Decimal
109from itertools import groupby
110from bisect import bisect_left, bisect_right
111from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
112from operator import itemgetter
113from collections import Counter
114
115# === Exceptions ===
116
class StatisticsError(ValueError):
    """Exception raised by the statistics functions for invalid data
    (e.g. an empty dataset where at least one value is required)."""
    pass
119
120
121# === Private utilities ===
122
def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    If optional argument ``start`` is given, it is added to the total.
    If ``data`` is empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    # ``partials`` maps each exact denominator to the running sum of the
    # numerators seen over that denominator.  The key None (produced by
    # _exact_ratio for a NAN or INF) marks a non-finite total.
    n, d = _exact_ratio(start)
    partials = {d: n}
    partials_get = partials.get  # bound-method lookup hoisted out of the loop
    # T tracks the common result type; int coerces to anything else.
    T = _coerce(int, type(start))
    # groupby lets us coerce the type once per run of identical types
    # instead of once per item.
    for typ, values in groupby(data, type):
        T = _coerce(T, typ)  # or raise TypeError
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        # FIXME is this faster if we sum them in order of the denominator?
        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
    return (T, total, count)
179
180
181def _isfinite(x):
182 try:
183 return x.is_finite() # Likely a Decimal.
184 except AttributeError:
185 return math.isfinite(x) # Coerces to float first.
186
187
188def _coerce(T, S):
189 """Coerce types T and S to a common type, or raise TypeError.
190
191 Coercion rules are currently an implementation detail. See the CoerceTest
192 test class in test_statistics for details.
193 """
194 # See http://bugs.python.org/issue24068.
195 assert T is not bool, "initial type T is bool"
196 # If the types are the same, no need to coerce anything. Put this
197 # first, so that the usual case (no coercion needed) happens as soon
198 # as possible.
199 if T is S: return T
200 # Mixed int & other coerce to the other type.
201 if S is int or S is bool: return T
202 if T is int: return S
203 # If one is a (strict) subclass of the other, coerce to the subclass.
204 if issubclass(S, T): return S
205 if issubclass(T, S): return T
206 # Ints coerce to the other type.
207 if issubclass(T, int): return S
208 if issubclass(S, int): return T
209 # Mixed fraction & float coerces to float (or float subclass).
210 if issubclass(T, Fraction) and issubclass(S, float):
211 return S
212 if issubclass(T, float) and issubclass(S, Fraction):
213 return T
214 # Any other combination is disallowed.
215 msg = "don't know how to coerce %s and %s"
216 raise TypeError(msg % (T.__name__, S.__name__))
217
218
219def _exact_ratio(x):
220 """Return Real number x to exact (numerator, denominator) pair.
221
222 >>> _exact_ratio(0.25)
223 (1, 4)
224
225 x is expected to be an int, Fraction, Decimal or float.
226 """
227 try:
228 # Optimise the common case of floats. We expect that the most often
229 # used numeric type will be builtin floats, so try to make this as
230 # fast as possible.
231 if type(x) is float or type(x) is Decimal:
232 return x.as_integer_ratio()
233 try:
234 # x may be an int, Fraction, or Integral ABC.
235 return (x.numerator, x.denominator)
236 except AttributeError:
237 try:
238 # x may be a float or Decimal subclass.
239 return x.as_integer_ratio()
240 except AttributeError:
241 # Just give up?
242 pass
243 except (OverflowError, ValueError):
244 # float NAN or INF.
245 assert not _isfinite(x)
246 return (x, None)
247 msg = "can't convert type '{}' to numerator/denominator"
248 raise TypeError(msg.format(type(x).__name__))
249
250
251def _convert(value, T):
252 """Convert value to given numeric type T."""
253 if type(value) is T:
254 # This covers the cases where T is Fraction, or where value is
255 # a NAN or INF (Decimal or float).
256 return value
257 if issubclass(T, int) and value.denominator != 1:
258 T = float
259 try:
260 # FIXME: what do we do if this overflows?
261 return T(value)
262 except TypeError:
263 if issubclass(T, Decimal):
264 return T(value.numerator) / T(value.denominator)
265 else:
266 raise
267
268
269def _find_lteq(a, x):
270 'Locate the leftmost value exactly equal to x'
271 i = bisect_left(a, x)
272 if i != len(a) and a[i] == x:
273 return i
274 raise ValueError
275
276
277def _find_rteq(a, l, x):
278 'Locate the rightmost value exactly equal to x'
279 i = bisect_right(a, x, lo=l)
280 if i != (len(a) + 1) and a[i - 1] == x:
281 return i - 1
282 raise ValueError
283
284
285def _fail_neg(values, errmsg='negative value'):
286 """Iterate over values, failing if any are less than zero."""
287 for x in values:
288 if x < 0:
289 raise StatisticsError(errmsg)
290 yield x
291
292
293# === Measures of central tendency (averages) ===
294
def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    Raises StatisticsError when ``data`` is empty.
    """
    # A one-shot iterator must be materialized so it can be both
    # counted and summed.
    if iter(data) is data:
        data = list(data)
    size = len(data)
    if size == 0:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == size
    # Convert the exact fractional total back to the data's own type.
    return _convert(total / size, T)
319
320
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
    """
    try:
        n = len(data)
    except TypeError:
        # Iterator input with no __len__(): count the items as fsum()
        # consumes them.
        n = 0
        def counted(iterable):
            nonlocal n
            for n, value in enumerate(iterable, start=1):
                yield value
        total = fsum(counted(data))
    else:
        total = fsum(data)
    if n:
        return total / n
    raise StatisticsError('fmean requires at least one data point')
346
347
def geometric_mean(data):
    """Return the geometric mean of data, computed using floats.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    # log() raises ValueError for zero/negative inputs, and fmean() raises
    # StatisticsError (a ValueError subclass) for an empty dataset, so a
    # single except clause covers both failure modes.
    try:
        log_average = fmean(map(log, data))
    except ValueError:
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              ' containing positive numbers') from None
    return exp(log_average)
365
366
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data.
    It is often the appropriate average for quantities which are rates
    or ratios, such as speeds.  Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    Raises ``StatisticsError`` when ``data`` is empty or contains an
    element less than zero.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    if n == 1:
        # Single data point: no averaging needed, but still validate it.
        x = data[0]
        if not isinstance(x, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if x < 0:
            raise StatisticsError(errmsg)
        return x
    try:
        T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        # Any zero in the data pins the harmonic mean at zero.
        return 0
    assert count == n
    return _convert(n / total, T)
411
412
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    For an odd number of data points the middle value is returned.  For an
    even count, the median is interpolated by averaging the two middle
    values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    values = sorted(data)
    n = len(values)
    if n == 0:
        raise StatisticsError("no median for empty data")
    mid = n // 2
    if n % 2:
        return values[mid]
    return (values[mid - 1] + values[mid]) / 2
436
437
def median_low(data):
    """Return the low median of numeric data.

    For an odd number of data points this is the middle value; for an
    even count it is the smaller of the two middle values.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    values = sorted(data)
    n = len(values)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2:
        return values[n // 2]
    return values[n // 2 - 1]
458
459
def median_high(data):
    """Return the high median of data.

    For an odd number of data points this is the middle value; for an
    even count it is the larger of the two middle values.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    values = sorted(data)
    if not values:
        raise StatisticsError("no median for empty data")
    return values[len(values) // 2]
477
478
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n // 2]
    # Strings and bytes support comparison and would otherwise fail later
    # with a confusing error, so reject them explicitly up front.
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval / 2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval) / 2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    # cf: cumulative frequency of the classes below the median class
    # (the number of data points preceding the first occurrence of x);
    # f: frequency within the median class (number of occurrences of x).
    cf = l1
    f = l2 - l1 + 1
    # Standard grouped-median interpolation within the class [L, L+interval).
    return L + interval * (n / 2 - cf) / f
532
533
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    Nominal (non-numeric) data is supported as well:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    When several values share the highest frequency, the first one
    encountered in the data is returned:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    counts = Counter(iter(data))
    if not counts:
        raise StatisticsError('no mode for empty data')
    # most_common() breaks frequency ties in first-encountered order.
    return counts.most_common(1)[0][0]
562
563
def multimode(data):
    """Return a list of the most frequently occurring values.

    Returns more than one result when there are multiple modes,
    or an empty list when *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data)).most_common()
    if not counts:
        return []
    # most_common() sorts by descending frequency, so the first entry
    # carries the maximal count; keep every value that matches it.
    top_frequency = counts[0][1]
    return [value for value, frequency in counts if frequency == top_frequency]
580
581
582# Notes on methods for computing quantiles
583# ----------------------------------------
584#
585# There is no one perfect way to compute quantiles. Here we offer
586# two methods that serve common needs. Most other packages
587# surveyed offered at least one or both of these two, making them
588# "standard" in the sense of "widely-adopted and reproducible".
589# They are also easy to explain, easy to compute manually, and have
590# straight-forward interpretations that aren't surprising.
591
592# The default method is known as "R6", "PERCENTILE.EXC", or "expected
593# value of rank order statistics". The alternative method is known as
594# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
595
596# For sample data where there is a positive probability for values
597# beyond the range of the data, the R6 exclusive method is a
598# reasonable choice. Consider a random sample of nine values from a
599# population with a uniform distribution from 0.0 to 1.0. The
600# distribution of the third ranked sample point is described by
601# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
602# mean=0.300. Only the latter (which corresponds with R6) gives the
603# desired cut point with 30% of the population falling below that
604# value, making it comparable to a result from an inv_cdf() function.
605# The R6 exclusive method is also idempotent.
606
607# For describing population data where the end points are known to
608# be included in the data, the R7 inclusive method is a reasonable
609# choice. Instead of the mean, it uses the mode of the beta
610# distribution for the interior points. Per Hyndman & Fan, "One nice
611# property is that the vertices of Q7(p) divide the range into n - 1
612# intervals, and exactly 100p% of the intervals lie to the left of
613# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
614
615# If needed, other methods could be added. However, for now, the
616# position is that fewer options make for easier choices and that
617# external packages can be used for anything more advanced.
618
def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default), to 10 for deciles, or to
    100 for percentiles (which gives the 99 cut points that separate
    *data* into 100 equal sized groups).

    The *data* can be any iterable containing samples.  The cut points
    are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is then treated as the 0th percentile and
    the maximum value as the 100th percentile.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        # R7 / PERCENTILE.INC: endpoints are the 0th and 100th percentiles.
        m = ld - 1
        points = []
        for i in range(1, n):
            j, delta = divmod(i * m, n)
            points.append((data[j] * (n - delta) + data[j + 1] * delta) / n)
        return points
    if method == 'exclusive':
        # R6 / PERCENTILE.EXC: allows for values beyond the observed range.
        m = ld + 1
        points = []
        for i in range(1, n):
            j = min(max(i * m // n, 1), ld - 1)  # rescale i, clamp to 1 .. ld-1
            delta = i * m - j * n                # exact integer math
            points.append((data[j - 1] * (n - delta) + data[j] * delta) / n)
        return points
    raise ValueError(f'Unknown method: {method!r}')
660
661
662# === Measures of spread ===
663
664# See http://mathworld.wolfram.com/Variance.html
665# http://mathworld.wolfram.com/SampleVariance.html
666# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
667#
668# Under no circumstances use the so-called "computational formula for
669# variance", as that is only suitable for hand calculations with a small
670# amount of low-precision data. It has terrible numeric properties.
671#
672# See a comparison of three computational methods here:
673# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
674
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.

    Returns a ``(type, sum_of_squares)`` pair in the same style as _sum().
    """
    if c is not None:
        # Caller supplied the centre; trust it and do a single pass.
        T, total, count = _sum((x-c)**2 for x in data)
        return (T, total)
    c = mean(data)
    T, total, count = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.  Subtracting total2**2/n compensates for the error
    # introduced by using a computed (inexact) mean as the centre.
    U, total2, count2 = _sum((x - c) for x in data)
    assert T == U and count == count2
    total -= total2 ** 2 / len(data)
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)
695
696
def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    # Materialize one-shot iterators so the data can be traversed twice.
    if iter(data) is data:
        data = list(data)
    count = len(data)
    if count < 2:
        raise StatisticsError('variance requires at least two data points')
    T, ss = _ss(data, xbar)
    # Divide by n - 1 (Bessel's correction) for an unbiased sample estimate.
    return _convert(ss / (count - 1), T)
742
743
def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    # Materialize one-shot iterators so the data can be traversed twice.
    if iter(data) is data:
        data = list(data)
    count = len(data)
    if count < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, ss = _ss(data, mu)
    # The population variance divides by n (no Bessel's correction).
    return _convert(ss / count, T)
786
787
def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
    # Decimal results carry their own sqrt(); everything else goes
    # through math.sqrt() (which returns a float).
    decimal_sqrt = getattr(var, 'sqrt', None)
    if decimal_sqrt is not None:
        return decimal_sqrt()
    return math.sqrt(var)
802
803
def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    # Decimal results carry their own sqrt(); everything else goes
    # through math.sqrt() (which returns a float).
    decimal_sqrt = getattr(var, 'sqrt', None)
    if decimal_sqrt is not None:
        return decimal_sqrt()
    return math.sqrt(var)
818
819
820## Normal Distribution #####################################################
821
822
def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    #
    # q is the (signed) distance of p from the distribution midpoint.
    q = p - 0.5
    if fabs(q) <= 0.425:
        # Central region: rational polynomial approximation in
        # r = 0.180625 - q*q.
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    # Tail regions: work with the smaller tail probability, transformed
    # through r = sqrt(-log(r)).
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        # Near tail.
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        # Far tail.
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    # Restore the sign for the lower tail.
    if q < 0.0:
        x = -x
    return mu + (x * sigma)
895
896
# Prefer the accelerated C implementation from the private _statistics
# module when it is available; otherwise silently keep the pure-Python
# _normal_dist_inv_cdf defined above.
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass
902
903
904class NormalDist:
905 "Normal distribution of a random variable"
906 # https://en.wikipedia.org/wiki/Normal_distribution
907 # https://en.wikipedia.org/wiki/Variance#Properties
908
909 __slots__ = {
910 '_mu': 'Arithmetic mean of a normal distribution',
911 '_sigma': 'Standard deviation of a normal distribution',
912 }
913
    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        # Only a negative spread is rejected; sigma == 0.0 is accepted here
        # (pdf()/cdf() raise later if such a distribution is queried).
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)
920
921 @classmethod
922 def from_samples(cls, data):
923 "Make a normal distribution instance from sample data."
924 if not isinstance(data, (list, tuple)):
925 data = list(data)
926 xbar = fmean(data)
927 return cls(xbar, stdev(data, xbar))
928
929 def samples(self, n, *, seed=None):
930 "Generate *n* samples for a given mean and standard deviation."
931 gauss = random.gauss if seed is None else random.Random(seed).gauss
932 mu, sigma = self._mu, self._sigma
933 return [gauss(mu, sigma) for i in range(n)]
934
935 def pdf(self, x):
936 "Probability density function. P(x <= X < x+dx) / dx"
937 variance = self._sigma ** 2.0
938 if not variance:
939 raise StatisticsError('pdf() not defined when sigma is zero')
940 return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
941
942 def cdf(self, x):
943 "Cumulative distribution function. P(X <= x)"
944 if not self._sigma:
945 raise StatisticsError('cdf() not defined when sigma is zero')
946 return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
947
    def inv_cdf(self, p):
        """Inverse cumulative distribution function.  x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
        """
        # Note: a NaN *p* passes both guards and is handed to the
        # approximation below; do not rewrite these comparisons.
        if p <= 0.0 or p >= 1.0:
            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
        if self._sigma <= 0.0:
            raise StatisticsError('cdf() not defined when sigma at or below zero')
        # Delegates to the module-level rational approximation (possibly
        # the C implementation imported from _statistics).
        return _normal_dist_inv_cdf(p, self._mu, self._sigma)
963
964 def quantiles(self, n=4):
965 """Divide into *n* continuous intervals with equal probability.
966
967 Returns a list of (n - 1) cut points separating the intervals.
968
969 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
970 Set *n* to 100 for percentiles which gives the 99 cuts points that
971 separate the normal distribution in to 100 equal sized groups.
972 """
973 return [self.inv_cdf(i / n) for i in range(1, n)]
974
    def overlap(self, other):
        """Compute the overlapping coefficient (OVL) between two normal distributions.

        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.

        >>> N1 = NormalDist(2.4, 1.6)
        >>> N2 = NormalDist(3.2, 2.0)
        >>> N1.overlap(N2)
        0.8035050657330205
        """
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y._mu - X._mu)
        if not dv:
            # Equal variances: the two densities cross at a single point.
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        # Unequal variances: x1 and x2 are the two crossing points of the
        # density curves (per Inman & Bradley, cited above).
        a = X._mu * Y_var - Y._mu * X_var
        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1008
1009 def zscore(self, x):
1010 """Compute the Standard Score. (x - mean) / stdev
1011
1012 Describes *x* in terms of the number of standard deviations
1013 above or below the mean of the normal distribution.
1014 """
1015 # https://www.statisticshowto.com/probability-and-statistics/z-score/
1016 if not self._sigma:
1017 raise StatisticsError('zscore() not defined when sigma is zero')
1018 return (x - self._mu) / self._sigma
1019
1020 @property
1021 def mean(self):
1022 "Arithmetic mean of the normal distribution."
1023 return self._mu
1024
1025 @property
1026 def median(self):
1027 "Return the median of the normal distribution"
1028 return self._mu
1029
1030 @property
1031 def mode(self):
1032 """Return the mode of the normal distribution
1033
1034 The mode is the value x where which the probability density
1035 function (pdf) takes its maximum value.
1036 """
1037 return self._mu
1038
1039 @property
1040 def stdev(self):
1041 "Standard deviation of the normal distribution."
1042 return self._sigma
1043
1044 @property
1045 def variance(self):
1046 "Square of the standard deviation."
1047 return self._sigma ** 2.0
1048
1049 def __add__(x1, x2):
1050 """Add a constant or another NormalDist instance.
1051
1052 If *other* is a constant, translate mu by the constant,
1053 leaving sigma unchanged.
1054
1055 If *other* is a NormalDist, add both the means and the variances.
1056 Mathematically, this works only if the two distributions are
1057 independent or if they are jointly normally distributed.
1058 """
1059 if isinstance(x2, NormalDist):
1060 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
1061 return NormalDist(x1._mu + x2, x1._sigma)
1062
1063 def __sub__(x1, x2):
1064 """Subtract a constant or another NormalDist instance.
1065
1066 If *other* is a constant, translate by the constant mu,
1067 leaving sigma unchanged.
1068
1069 If *other* is a NormalDist, subtract the means and add the variances.
1070 Mathematically, this works only if the two distributions are
1071 independent or if they are jointly normally distributed.
1072 """
1073 if isinstance(x2, NormalDist):
1074 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
1075 return NormalDist(x1._mu - x2, x1._sigma)
1076
1077 def __mul__(x1, x2):
1078 """Multiply both mu and sigma by a constant.
1079
1080 Used for rescaling, perhaps to change measurement units.
1081 Sigma is scaled with the absolute value of the constant.
1082 """
1083 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
1084
1085 def __truediv__(x1, x2):
1086 """Divide both mu and sigma by a constant.
1087
1088 Used for rescaling, perhaps to change measurement units.
1089 Sigma is scaled with the absolute value of the constant.
1090 """
1091 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
1092
1093 def __pos__(x1):
1094 "Return a copy of the instance."
1095 return NormalDist(x1._mu, x1._sigma)
1096
1097 def __neg__(x1):
1098 "Negates mu while keeping sigma the same."
1099 return NormalDist(-x1._mu, x1._sigma)
1100
    # Addition is commutative here, so reflected addition reuses __add__.
    __radd__ = __add__
1102
1103 def __rsub__(x1, x2):
1104 "Subtract a NormalDist from a constant or another NormalDist."
1105 return -(x1 - x2)
1106
    # Scaling by a constant is commutative, so reflected multiply reuses __mul__.
    __rmul__ = __mul__
1108
1109 def __eq__(x1, x2):
1110 "Two NormalDist objects are equal if their mu and sigma are both equal."
1111 if not isinstance(x2, NormalDist):
1112 return NotImplemented
1113 return x1._mu == x2._mu and x1._sigma == x2._sigma
1114
1115 def __hash__(self):
1116 "NormalDist objects hash equal if their mu and sigma are both equal."
1117 return hash((self._mu, self._sigma))
1118
1119 def __repr__(self):
1120 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'