1## Module statistics.py 2## 3## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>. 4## 5## Licensed under the Apache License, Version 2.0 (the "License"); 6## you may not use this file except in compliance with the License. 7## You may obtain a copy of the License at 8## 9## http://www.apache.org/licenses/LICENSE-2.0 10## 11## Unless required by applicable law or agreed to in writing, software 12## distributed under the License is distributed on an "AS IS" BASIS, 13## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14## See the License for the specific language governing permissions and 15## limitations under the License. 16 17 18""" 19Basic statistics module. 20 21This module provides functions for calculating statistics of data, including 22averages, variance, and standard deviation. 23 24Calculating averages 25-------------------- 26 27================== ============================================= 28Function Description 29================== ============================================= 30mean Arithmetic mean (average) of data. 31median Median (middle value) of data. 32median_low Low median of data. 33median_high High median of data. 34median_grouped Median, or 50th percentile, of grouped data. 35mode Mode (most common value) of data. 36================== ============================================= 37 38Calculate the arithmetic mean ("the average") of data: 39 40>>> mean([-1.0, 2.5, 3.25, 5.75]) 412.625 42 43 44Calculate the standard median of discrete data: 45 46>>> median([2, 3, 4, 5]) 473.5 48 49 50Calculate the median, or 50th percentile, of data grouped into class intervals 51centred on the data values provided. E.g. if your data points are rounded to 52the nearest whole number: 53 54>>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS 552.8333333333... 56 57This should be interpreted in this way: you have two data points in the class 58interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in 59the class interval 3.5-4.5. The median of these data points is 2.8333... 60 61 62Calculating variability or spread 63--------------------------------- 64 65================== ============================================= 66Function Description 67================== ============================================= 68pvariance Population variance of data. 69variance Sample variance of data. 70pstdev Population standard deviation of data. 71stdev Sample standard deviation of data. 72================== ============================================= 73 74Calculate the standard deviation of sample data: 75 76>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS 774.38961843444... 78 79If you have previously calculated the mean, you can pass it as the optional 80second argument to the four "spread" functions to avoid recalculating it: 81 82>>> data = [1, 2, 2, 4, 4, 4, 5, 6] 83>>> mu = mean(data) 84>>> pvariance(data, mu) 852.5 86 87 88Exceptions 89---------- 90 91A single exception is defined: StatisticsError is a subclass of ValueError. 92 93""" 94 95__all__ = [ 'StatisticsError', 96 'pstdev', 'pvariance', 'stdev', 'variance', 97 'median', 'median_low', 'median_high', 'median_grouped', 98 'mean', 'mode', 99 ] 100 101 102import collections 103import math 104 105from fractions import Fraction 106from decimal import Decimal 107from itertools import groupby 108 109 110 111# === Exceptions === 112 113class StatisticsError(ValueError): 114 pass 115 116 117# === Private utilities === 118 119def _sum(data, start=0): 120 """_sum(data [, start]) -> (type, sum, count) 121 122 Return a high-precision sum of the given numeric data as a fraction, 123 together with the type to be converted to and the count of items. 124 125 If optional argument ``start`` is given, it is added to the total. 126 If ``data`` is empty, ``start`` (defaulting to 0) is returned. 127 128 129 Examples 130 -------- 131 132 >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) 133 (<class 'float'>, Fraction(11, 1), 5) 134 135 Some sources of round-off error will be avoided: 136 137 >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero. 138 (<class 'float'>, Fraction(1000, 1), 3000) 139 140 Fractions and Decimals are also supported: 141 142 >>> from fractions import Fraction as F 143 >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) 144 (<class 'fractions.Fraction'>, Fraction(63, 20), 4) 145 146 >>> from decimal import Decimal as D 147 >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] 148 >>> _sum(data) 149 (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4) 150 151 Mixed types are currently treated as an error, except that int is 152 allowed. 153 """ 154 count = 0 155 n, d = _exact_ratio(start) 156 partials = {d: n} 157 partials_get = partials.get 158 T = _coerce(int, type(start)) 159 for typ, values in groupby(data, type): 160 T = _coerce(T, typ) # or raise TypeError 161 for n,d in map(_exact_ratio, values): 162 count += 1 163 partials[d] = partials_get(d, 0) + n 164 if None in partials: 165 # The sum will be a NAN or INF. We can ignore all the finite 166 # partials, and just look at this special one. 167 total = partials[None] 168 assert not _isfinite(total) 169 else: 170 # Sum all the partial sums using builtin sum. 171 # FIXME is this faster if we sum them in order of the denominator? 172 total = sum(Fraction(n, d) for d, n in sorted(partials.items())) 173 return (T, total, count) 174 175 176def _isfinite(x): 177 try: 178 return x.is_finite() # Likely a Decimal. 179 except AttributeError: 180 return math.isfinite(x) # Coerces to float first. 181 182 183def _coerce(T, S): 184 """Coerce types T and S to a common type, or raise TypeError. 185 186 Coercion rules are currently an implementation detail. See the CoerceTest 187 test class in test_statistics for details. 188 """ 189 # See http://bugs.python.org/issue24068. 190 assert T is not bool, "initial type T is bool" 191 # If the types are the same, no need to coerce anything. Put this 192 # first, so that the usual case (no coercion needed) happens as soon 193 # as possible. 194 if T is S: return T 195 # Mixed int & other coerce to the other type. 196 if S is int or S is bool: return T 197 if T is int: return S 198 # If one is a (strict) subclass of the other, coerce to the subclass. 199 if issubclass(S, T): return S 200 if issubclass(T, S): return T 201 # Ints coerce to the other type. 202 if issubclass(T, int): return S 203 if issubclass(S, int): return T 204 # Mixed fraction & float coerces to float (or float subclass). 205 if issubclass(T, Fraction) and issubclass(S, float): 206 return S 207 if issubclass(T, float) and issubclass(S, Fraction): 208 return T 209 # Any other combination is disallowed. 210 msg = "don't know how to coerce %s and %s" 211 raise TypeError(msg % (T.__name__, S.__name__)) 212 213 214def _exact_ratio(x): 215 """Return Real number x to exact (numerator, denominator) pair. 216 217 >>> _exact_ratio(0.25) 218 (1, 4) 219 220 x is expected to be an int, Fraction, Decimal or float. 221 """ 222 try: 223 # Optimise the common case of floats. We expect that the most often 224 # used numeric type will be builtin floats, so try to make this as 225 # fast as possible. 226 if type(x) is float: 227 return x.as_integer_ratio() 228 try: 229 # x may be an int, Fraction, or Integral ABC. 230 return (x.numerator, x.denominator) 231 except AttributeError: 232 try: 233 # x may be a float subclass. 234 return x.as_integer_ratio() 235 except AttributeError: 236 try: 237 # x may be a Decimal. 238 return _decimal_to_ratio(x) 239 except AttributeError: 240 # Just give up? 241 pass 242 except (OverflowError, ValueError): 243 # float NAN or INF. 244 assert not math.isfinite(x) 245 return (x, None) 246 msg = "can't convert type '{}' to numerator/denominator" 247 raise TypeError(msg.format(type(x).__name__)) 248 249 250# FIXME This is faster than Fraction.from_decimal, but still too slow. 251def _decimal_to_ratio(d): 252 """Convert Decimal d to exact integer ratio (numerator, denominator). 253 254 >>> from decimal import Decimal 255 >>> _decimal_to_ratio(Decimal("2.6")) 256 (26, 10) 257 258 """ 259 sign, digits, exp = d.as_tuple() 260 if exp in ('F', 'n', 'N'): # INF, NAN, sNAN 261 assert not d.is_finite() 262 return (d, None) 263 num = 0 264 for digit in digits: 265 num = num*10 + digit 266 if exp < 0: 267 den = 10**-exp 268 else: 269 num *= 10**exp 270 den = 1 271 if sign: 272 num = -num 273 return (num, den) 274 275 276def _convert(value, T): 277 """Convert value to given numeric type T.""" 278 if type(value) is T: 279 # This covers the cases where T is Fraction, or where value is 280 # a NAN or INF (Decimal or float). 281 return value 282 if issubclass(T, int) and value.denominator != 1: 283 T = float 284 try: 285 # FIXME: what do we do if this overflows? 286 return T(value) 287 except TypeError: 288 if issubclass(T, Decimal): 289 return T(value.numerator)/T(value.denominator) 290 else: 291 raise 292 293 294def _counts(data): 295 # Generate a table of sorted (value, frequency) pairs. 296 table = collections.Counter(iter(data)).most_common() 297 if not table: 298 return table 299 # Extract the values with the highest frequency. 300 maxfreq = table[0][1] 301 for i in range(1, len(table)): 302 if table[i][1] != maxfreq: 303 table = table[:i] 304 break 305 return table 306 307 308# === Measures of central tendency (averages) === 309 310def mean(data): 311 """Return the sample arithmetic mean of data. 312 313 >>> mean([1, 2, 3, 4, 4]) 314 2.8 315 316 >>> from fractions import Fraction as F 317 >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)]) 318 Fraction(13, 21) 319 320 >>> from decimal import Decimal as D 321 >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")]) 322 Decimal('0.5625') 323 324 If ``data`` is empty, StatisticsError will be raised. 325 """ 326 if iter(data) is data: 327 data = list(data) 328 n = len(data) 329 if n < 1: 330 raise StatisticsError('mean requires at least one data point') 331 T, total, count = _sum(data) 332 assert count == n 333 return _convert(total/n, T) 334 335 336# FIXME: investigate ways to calculate medians without sorting? Quickselect? 337def median(data): 338 """Return the median (middle value) of numeric data. 339 340 When the number of data points is odd, return the middle data point. 341 When the number of data points is even, the median is interpolated by 342 taking the average of the two middle values: 343 344 >>> median([1, 3, 5]) 345 3 346 >>> median([1, 3, 5, 7]) 347 4.0 348 349 """ 350 data = sorted(data) 351 n = len(data) 352 if n == 0: 353 raise StatisticsError("no median for empty data") 354 if n%2 == 1: 355 return data[n//2] 356 else: 357 i = n//2 358 return (data[i - 1] + data[i])/2 359 360 361def median_low(data): 362 """Return the low median of numeric data. 363 364 When the number of data points is odd, the middle value is returned. 365 When it is even, the smaller of the two middle values is returned. 366 367 >>> median_low([1, 3, 5]) 368 3 369 >>> median_low([1, 3, 5, 7]) 370 3 371 372 """ 373 data = sorted(data) 374 n = len(data) 375 if n == 0: 376 raise StatisticsError("no median for empty data") 377 if n%2 == 1: 378 return data[n//2] 379 else: 380 return data[n//2 - 1] 381 382 383def median_high(data): 384 """Return the high median of data. 385 386 When the number of data points is odd, the middle value is returned. 387 When it is even, the larger of the two middle values is returned. 388 389 >>> median_high([1, 3, 5]) 390 3 391 >>> median_high([1, 3, 5, 7]) 392 5 393 394 """ 395 data = sorted(data) 396 n = len(data) 397 if n == 0: 398 raise StatisticsError("no median for empty data") 399 return data[n//2] 400 401 402def median_grouped(data, interval=1): 403 """Return the 50th percentile (median) of grouped continuous data. 404 405 >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5]) 406 3.7 407 >>> median_grouped([52, 52, 53, 54]) 408 52.5 409 410 This calculates the median as the 50th percentile, and should be 411 used when your data is continuous and grouped. In the above example, 412 the values 1, 2, 3, etc. actually represent the midpoint of classes 413 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in 414 class 3.5-4.5, and interpolation is used to estimate it. 415 416 Optional argument ``interval`` represents the class interval, and 417 defaults to 1. Changing the class interval naturally will change the 418 interpolated 50th percentile value: 419 420 >>> median_grouped([1, 3, 3, 5, 7], interval=1) 421 3.25 422 >>> median_grouped([1, 3, 3, 5, 7], interval=2) 423 3.5 424 425 This function does not check whether the data points are at least 426 ``interval`` apart. 427 """ 428 data = sorted(data) 429 n = len(data) 430 if n == 0: 431 raise StatisticsError("no median for empty data") 432 elif n == 1: 433 return data[0] 434 # Find the value at the midpoint. Remember this corresponds to the 435 # centre of the class interval. 436 x = data[n//2] 437 for obj in (x, interval): 438 if isinstance(obj, (str, bytes)): 439 raise TypeError('expected number but got %r' % obj) 440 try: 441 L = x - interval/2 # The lower limit of the median interval. 442 except TypeError: 443 # Mixed type. For now we just coerce to float. 444 L = float(x) - float(interval)/2 445 cf = data.index(x) # Number of values below the median interval. 446 # FIXME The following line could be more efficient for big lists. 447 f = data.count(x) # Number of data points in the median interval. 448 return L + interval*(n/2 - cf)/f 449 450 451def mode(data): 452 """Return the most common data point from discrete or nominal data. 453 454 ``mode`` assumes discrete data, and returns a single value. This is the 455 standard treatment of the mode as commonly taught in schools: 456 457 >>> mode([1, 1, 2, 3, 3, 3, 3, 4]) 458 3 459 460 This also works with nominal (non-numeric) data: 461 462 >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) 463 'red' 464 465 If there is not exactly one most common value, ``mode`` will raise 466 StatisticsError. 467 """ 468 # Generate a table of sorted (value, frequency) pairs. 469 table = _counts(data) 470 if len(table) == 1: 471 return table[0][0] 472 elif table: 473 raise StatisticsError( 474 'no unique mode; found %d equally common values' % len(table) 475 ) 476 else: 477 raise StatisticsError('no mode for empty data') 478 479 480# === Measures of spread === 481 482# See http://mathworld.wolfram.com/Variance.html 483# http://mathworld.wolfram.com/SampleVariance.html 484# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 485# 486# Under no circumstances use the so-called "computational formula for 487# variance", as that is only suitable for hand calculations with a small 488# amount of low-precision data. It has terrible numeric properties. 489# 490# See a comparison of three computational methods here: 491# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ 492 493def _ss(data, c=None): 494 """Return sum of square deviations of sequence data. 495 496 If ``c`` is None, the mean is calculated in one pass, and the deviations 497 from the mean are calculated in a second pass. Otherwise, deviations are 498 calculated from ``c`` as given. Use the second case with care, as it can 499 lead to garbage results. 500 """ 501 if c is None: 502 c = mean(data) 503 T, total, count = _sum((x-c)**2 for x in data) 504 # The following sum should mathematically equal zero, but due to rounding 505 # error may not. 506 U, total2, count2 = _sum((x-c) for x in data) 507 assert T == U and count == count2 508 total -= total2**2/len(data) 509 assert not total < 0, 'negative sum of square deviations: %f' % total 510 return (T, total) 511 512 513def variance(data, xbar=None): 514 """Return the sample variance of data. 515 516 data should be an iterable of Real-valued numbers, with at least two 517 values. The optional argument xbar, if given, should be the mean of 518 the data. If it is missing or None, the mean is automatically calculated. 519 520 Use this function when your data is a sample from a population. To 521 calculate the variance from the entire population, see ``pvariance``. 522 523 Examples: 524 525 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5] 526 >>> variance(data) 527 1.3720238095238095 528 529 If you have already calculated the mean of your data, you can pass it as 530 the optional second argument ``xbar`` to avoid recalculating it: 531 532 >>> m = mean(data) 533 >>> variance(data, m) 534 1.3720238095238095 535 536 This function does not check that ``xbar`` is actually the mean of 537 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or 538 impossible results. 539 540 Decimals and Fractions are supported: 541 542 >>> from decimal import Decimal as D 543 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) 544 Decimal('31.01875') 545 546 >>> from fractions import Fraction as F 547 >>> variance([F(1, 6), F(1, 2), F(5, 3)]) 548 Fraction(67, 108) 549 550 """ 551 if iter(data) is data: 552 data = list(data) 553 n = len(data) 554 if n < 2: 555 raise StatisticsError('variance requires at least two data points') 556 T, ss = _ss(data, xbar) 557 return _convert(ss/(n-1), T) 558 559 560def pvariance(data, mu=None): 561 """Return the population variance of ``data``. 562 563 data should be an iterable of Real-valued numbers, with at least one 564 value. The optional argument mu, if given, should be the mean of 565 the data. If it is missing or None, the mean is automatically calculated. 566 567 Use this function to calculate the variance from the entire population. 568 To estimate the variance from a sample, the ``variance`` function is 569 usually a better choice. 570 571 Examples: 572 573 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25] 574 >>> pvariance(data) 575 1.25 576 577 If you have already calculated the mean of the data, you can pass it as 578 the optional second argument to avoid recalculating it: 579 580 >>> mu = mean(data) 581 >>> pvariance(data, mu) 582 1.25 583 584 This function does not check that ``mu`` is actually the mean of ``data``. 585 Giving arbitrary values for ``mu`` may lead to invalid or impossible 586 results. 587 588 Decimals and Fractions are supported: 589 590 >>> from decimal import Decimal as D 591 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) 592 Decimal('24.815') 593 594 >>> from fractions import Fraction as F 595 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)]) 596 Fraction(13, 72) 597 598 """ 599 if iter(data) is data: 600 data = list(data) 601 n = len(data) 602 if n < 1: 603 raise StatisticsError('pvariance requires at least one data point') 604 ss = _ss(data, mu) 605 T, ss = _ss(data, mu) 606 return _convert(ss/n, T) 607 608 609def stdev(data, xbar=None): 610 """Return the square root of the sample variance. 611 612 See ``variance`` for arguments and other details. 613 614 >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) 615 1.0810874155219827 616 617 """ 618 var = variance(data, xbar) 619 try: 620 return var.sqrt() 621 except AttributeError: 622 return math.sqrt(var) 623 624 625def pstdev(data, mu=None): 626 """Return the square root of the population variance. 627 628 See ``pvariance`` for arguments and other details. 629 630 >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) 631 0.986893273527251 632 633 """ 634 var = pvariance(data, mu) 635 try: 636 return var.sqrt() 637 except AttributeError: 638 return math.sqrt(var) 639