acgc.stats.bivariate
Bivariate statistics
Statistical measures of relationships between two populations
1#!/usr/bin/env python3 2# -*- coding: utf-8 -*- 3""" Bivariate statistics 4 5Statistical measures of relationships between two populations 6""" 7 8import numpy as np 9from scipy import stats 10from .bivariate_lines import sen, sma, bivariate_line_equation 11# import xarray as xr 12 13__all__ = [ 14 "BivariateStatistics", 15 "nmb", 16 "nmae", 17 "nmbf", 18 "nmaef" 19] 20 21def nmb( x0, x1 ): 22 '''Compute Normalized Mean Bias (NMB) 23 24 NMB = ( mean(x1) - mean(x0) ) / mean(x0) 25 26 Parameters 27 ---------- 28 x0 : array_like 29 reference values 30 x1 : array_like 31 experiment values 32 ''' 33 34 assert (len(x0) == len(x1)), \ 35 "Parameters x0 and x1 must have the same length" 36 37 # Mean values 38 x0_mean = np.mean(x0) 39 x1_mean = np.mean(x1) 40 41 # Metric value 42 return x1_mean / x0_mean - 1 43 44def nmae( x0, x1 ): 45 '''Compute Normalized Mean Absolute Error (NMAE) 46 47 NMAE = mean(abs(x1 - x0)) / abs(mean(x0)) 48 49 Parameters 50 --------- 51 x0 : array_like 52 reference values 53 x1 : array_like 54 experiment values 55 ''' 56 57 # Mean values 58 x0_mean = np.mean(x0) 59 60 # Mean absolute difference 61 abs_diff = np.mean( np.abs(x1 - x0) ) 62 63 # Metric value 64 return abs_diff / np.abs( x0_mean ) 65 66 67def nmbf( x0, x1 ): 68 '''Compute Normalized Mean Bias Factor (NMBF) 69 70 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 71 72 Parameters 73 ---------- 74 x0 : array_like 75 reference values 76 x1 : array_like 77 experiment values 78 ''' 79 80 # Ensure that arguments have the same length 81 assert (len(x0) == len(x1)), \ 82 "Parameters x0 and x1 must have the same length" 83 84 # Mean values 85 x0_mean = np.mean(x0) 86 x1_mean = np.mean(x1) 87 88 # Metric value 89 if x1_mean >= x0_mean: 90 result = x1_mean / x0_mean - 1 91 else: 92 result= 1 - x0_mean / x1_mean 93 # Equivalent (faster?) implementation 94 #S = (mMean - oMean) / np.abs(mMean - oMean) 95 #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 ) 96 97 return result 98 99def nmaef( x0, x1 ): 100 '''Compute Normalized Mean Absolute Error Factor (NMAEF) 101 102 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 103 104 Parameters 105 ---------- 106 x0 : array_like 107 reference values 108 x1 : array_like 109 experiment values 110 ''' 111 112 # Ensure that arguments have the same length 113 assert (len(x0) == len(x1)), \ 114 "Parameters x0 and x1 must have the same length" 115 116 # Mean values 117 x0_mean = np.mean(x0) 118 x1_mean = np.mean(x1) 119 120 # Mean absolute difference 121 abs_diff = np.mean( np.abs(x1 - x0)) 122 123 # Metric value 124 if x1_mean >= x0_mean: 125 result = abs_diff / x0_mean 126 else: 127 result = abs_diff / x1_mean 128 # Equivalent (faster?) implementation 129 #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean) 130 #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) ) 131 132 return result 133 134def _texify_name(name): 135 '''Return a LaTex formatted string for some variables 136 137 Parameters 138 ---------- 139 name : str 140 141 Returns 142 ------- 143 pretty_name : str 144 ''' 145 if name.lower()=='n': 146 pretty_name = r'$n$' 147 elif name=='R2': 148 pretty_name = f'$R^2$' 149 elif name=='r2': 150 pretty_name = f'$r^2$' 151 elif name.lower()=='y_ols': 152 pretty_name = r'$y_{\rm OLS}$' 153 elif name.lower()=='y_sma': 154 pretty_name = r'$y_{\rm SMA}$' 155 elif name.lower()=='y_sen': 156 pretty_name = r'$y_{\rm Sen}$' 157 else: 158 pretty_name = name 159 return pretty_name 160 161def _number2str(value, 162 intformat='{:d}', 163 floatformat='{:.4f}'): 164 '''Format number as string using integer and float format specifiers 165 166 Parameters 167 ---------- 168 value : numeric, str 169 value to be converted 170 intformat : str, default='{:d}' 171 format specifier for integer types 172 floatformat : str, default='{:.4f}' 173 format specifier for float types 174 175 Returns 176 ------- 177 str 178 ''' 179 if isinstance(value,str): 180 pass 181 elif isinstance(value,(int,np.integer)): 182 value = intformat.format(value) 183 else: 184 value = floatformat.format(value) 185 return value 186 187class BivariateStatistics: 188 '''A suite of common statistics to quantify bivariate relationships 189 190 Class method 'summary' provides a formatted summary of these statistics 191 192 Attributes 193 ---------- 194 count, n : int 195 number of valid (not NaN) data value pairs 196 xmean, ymean : float 197 mean of x and y variables 198 xmedian, ymedian :float 199 median of x and y variables 200 xstd, ystd : float 201 standard deviation of x and y variables 202 mean_difference, md : float 203 ymean - xmean 204 mean_absolute_difference, mad : float 205 mean( |y-x| ) 206 relative_mean_difference, rmd : float 207 md / xmean 208 relative_mean_absolute_difference, rmad :float 209 mad / xmean 210 standardized_mean_difference, smd : float 211 md / xstd 212 standardized_mean_absolute_difference, smad : float 213 mad /xstd 214 mean_relative_difference, mrd : float 215 mean(y/x) - 1 216 mean_log10_ratio, mlr : float 217 mean( log10(y/x) ) 218 mean_absolute_log10_ratio, malr : float 219 mean( abs( log10(y/x) ) ) 220 median_difference, medd : float 221 median(y-x) 222 median_absolute_difference, medad : float 223 median(|y-x|) 224 relative_median_difference, rmedd : float 225 median(y-x) / xmedian 226 relative_median_absolute_difference, rmedad : float 227 median(|y-x|) / xmedian 228 median_relative_difference, medianrd, medrd : float 229 median(y/x)-1 230 median_log10_ratio, medlr : float 231 median( log10(y/x) ) 232 median_absolute_log10_ratio, medalr : float 233 median( abs( log10(y/x) ) ) 234 normalized_mean_bias_factor, nmbf : float 235 see `nmbf` 236 normalized_mean_absolute_error_factor, nmaef : float 237 see `nmaef` 238 root_mean_square_difference, rmsd : float 239 $\\sqrt{ \\langle (y - x)^2 \\rangle }$ 240 covariance : float 241 cov(x,y) 242 correlation_pearson, correlation, pearsonr, R, r : float 243 Pearson linear correlation coefficient 244 correlation_spearman, spearmanr : float 245 Spearman, non-parametric rank correlation coefficient 246 R2, r2 : float 247 Linear coefficient of determination, $R^2$ 248 ''' 249 250 def __init__(self,x,y,w=None,dropna=False,data=None): 251 '''Compute suite of bivariate statistics during initialization 252 253 Statistic values are saved in attributes. 254 CAUTION: Weights w are ignored except in SMA fit 255 256 Parameters 257 ---------- 258 x : ndarray or str 259 independent variable values 260 y : ndarray or str 261 dependent variable values, same size as x 262 w : ndarray or str, optional 263 weights for points (x,y), same size as x and y 264 dropna : bool, optional (default=False) 265 drops NaN values from x, y, and w 266 data : dict-like, optional 267 if x, y, or w are str, then they should be keys in data 268 ''' 269 270 # Get values from data if needed 271 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 272 raise ValueError( 'Data argument must be used if x, y, or w is a string') 273 if isinstance(x,str): 274 x = data[x] 275 if isinstance(y,str): 276 y = data[y] 277 if isinstance(w,str): 278 w = data[w] 279 280 #Ensure that x and y have same length 281 if len(x) != len(y): 282 raise ValueError( 'Arguments x and y must have the same length' ) 283 if w is None: 284 w = np.ones_like(x) 285 if len(w) != len(x): 286 raise ValueError( 'Argument w (if present) must have the same length as x' ) 287 288 # Drop NaN values 289 if dropna: 290 isna = np.isnan(x*y*w) 291 x = x[~isna] 292 y = y[~isna] 293 w = w[~isna] 294 295 # Differences and ratios used repeatedly 296 diff = y - x 297 absdiff = np.abs( y - x ) 298 # Ignore divide by zero and 0/0 while dividing 299 old_settings = np.seterr(divide='ignore',invalid='ignore') 300 ratio = y/x 301 log10ratio = np.log10(ratio) 302 np.seterr(**old_settings) 303 304 # Number of data points 305 self.count = self.n = len(x) 306 307 # Means, medians, and standard deviations 308 self.xmean = np.mean(x) 309 self.ymean = np.mean(y) 310 self.xmedian = np.median(x) 311 self.ymedian = np.median(y) 312 self.xstd = np.std(x) 313 self.ystd = np.std(y) 314 315 # Save values for use later 316 self._x = x 317 self._y = y 318 self._w = w 319 320 # Mean and mean absolute differences 321 self.mean_difference = self.md = self.ymean - self.xmean 322 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 323 324 # Relative and standardized differences 325 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 326 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 327 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 328 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 329 330 # Mean and median relative differences 331 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 332 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 333 334 # Median and median absolute differences 335 self.median_difference = self.medd = np.median( diff ) 336 self.median_absolute_difference = self.medad = np.median( absdiff ) 337 338 # Relative median differences 339 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 340 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 341 342 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 343 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 344 345 # Mean and mean absolute log ratio 346 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 347 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 348 349 # Median and median absolute log ratio 350 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 351 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 352 353 # RMS difference 354 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 355 356 # Covariance, correlation 357 self.covariance = np.cov(x,y)[0][1] 358 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 359 np.corrcoef(x,y)[0][1] 360 self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 361 self.R2 = self.r2 = self.R**2 362 363 def __getitem__(self,key): 364 '''Accesses attribute values via object['key']''' 365 return getattr(self,key) 366 367 def fitline(self,method='sma',intercept=True,**kwargs): 368 '''Compute bivariate line fit 369 370 Parameters 371 ---------- 372 method : str 373 line fitting method: sma (default), ols, wls, York, sen, siegel 374 intercept : bool 375 defines whether non-zero intercept should be fitted 376 **kwargs 377 passed to `acgc.stats.sma` (e.g. robust=True) 378 379 Returns 380 ------- 381 result : dict 382 dictionary with keys: 383 - slope (float) 384 slope of fitted line 385 - intercept (float) 386 intercept of fitted line 387 - fittedvalues (array (N,)) 388 values on fit line 389 - residuals (array (N,)) 390 residual from fit line 391 ''' 392 393 fitintercept = intercept 394 395 if method.lower()=='sma': 396 fit = sma( self._x, 397 self._y, 398 self._w, 399 intercept=fitintercept, 400 **kwargs) 401 slope = fit['slope'] 402 intercept= fit['intercept'] 403 404 elif method.lower()=='ols': 405 if fitintercept: 406 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 407 self._y, rcond=None ) 408 else: 409 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 410 slope = ols[0][0] 411 intercept = ols[0][1] 412 413 elif method.lower() in ['theil','sen','theilsen']: 414 fitintercept = True 415 fit = sen( self._x, 416 self._y, 417 **kwargs) 418 slope = fit.slope 419 intercept = fit.intercept 420 421 elif method.lower()=='siegel': 422 fitintercept = True 423 siegel = stats.siegelslopes( self._x, 424 self._y ) 425 slope = siegel.slope 426 intercept = siegel.intercept 427 428 elif method.lower()=='wls': 429 raise NotImplementedError('WLS regression not implemented yet') 430 431 elif method.lower()=='york': 432 raise NotImplementedError('York regression not implemented yet') 433 434 else: 435 raise ValueError('Undefined method '+method) 436 437 line = dict( slope = slope, 438 intercept = intercept, 439 fittedvalues = slope * self._x + intercept, 440 residuals = self._y - ( slope * self._x + intercept ), 441 method = method, 442 fitintercept = fitintercept ) 443 444 return line 445 446 def slope(self,method='sma',intercept=True,**kwargs): 447 '''Compute slope of bivariate line fit 448 449 Parameters 450 ---------- 451 method : str 452 line fitting method: sma (default), ols, wls 453 intercept : bool 454 defines whether non-zero intercept should be fitted 455 **kwargs 456 passed to `fitline` 457 458 Returns 459 ------- 460 slope : float 461 value of y intercept 462 ''' 463 return self.fitline(method,intercept,**kwargs)['slope'] 464 465 def intercept(self,method='sma',intercept=True,**kwargs): 466 '''Compute intercept of bivariate line fit 467 468 Parameters 469 ---------- 470 method : str 471 line fitting method: sma (default) or ols 472 intercept : bool 473 defines whether non-zero intercept should be fitted 474 **kwargs 475 passed to `fitline` 476 477 Returns 478 ------- 479 intercept : float 480 value of y intercept 481 ''' 482 return self.fitline(method,intercept,**kwargs)['intercept'] 483 484 def _expand_variables(self,variables): 485 '''Expand special strings into a list of variables 486 487 Parameter 488 --------- 489 variables : list or str, default='common' 490 Special strings ("all","common") will be expanded to a list of variables 491 list arguments will not be modified 492 493 Returns 494 ------- 495 list 496 variable names 497 ''' 498 if variables is None: 499 variables='common' 500 if variables=='all': 501 variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD', 502 'MLR','MALR', 503 'MedD','MedAD','RMedD','RMedAD','MedRD', 504 'MedLR','MedALR', 505 'NMBF','NMAEF','RMSD','cov', 506 'R','R2','spearmanr','slope','intercept', 507 'fitline','n'] 508 elif variables=='common': 509 variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n'] 510 if not isinstance(variables,list): 511 raise ValueError( 512 'variables must be a list, None, or one of these strings: "all","common"') 513 514 return variables 515 516 def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'): 517 '''Summarize bivariate statistics into a dict 518 519 Parameters 520 ---------- 521 vars : list or str, default='common' 522 names of attribute variables to include in summary 523 names are case insensitive 524 The following strings are also accepted in place of a list 525 "all" (displays all variables) 526 "common" (displays all measures of mean difference) 527 fitline_kw : dict, default=None 528 keywords passed to `fitline` 529 floatformat_fiteqn : str, default=floatformat 530 format specifier for slope and intercept (a,b) in y = a x + b 531 532 Returns 533 ------- 534 summary : dict 535 names and values of variables 536 ''' 537 538 # List of variables 539 variables = self._expand_variables(variables) 540 541 if fitline_kw is None: 542 fitline_kw = {'method':'sma', 543 'intercept':True} 544 545 # Construct the dict 546 summary = {} 547 for v in variables: 548 if v in ['slope','intercept']: 549 # These variables are object methods 550 func = getattr(self,v) 551 value = func(**fitline_kw) 552 elif v == 'fitline': 553 line = self.fitline(**fitline_kw) 554 v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate') 555 else: 556 # Retrieve values 557 value = getattr(self,v.lower()) 558 559 # summary += (stringformat+'='+floatformat+'\n').format(v,value) 560 summary[v] = value 561 562 return summary 563 564 def summary(self, variables=None, fitline_kw=None, 565 intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, 566 stringlength=None ): 567 '''Summarize bivariate statistics 568 569 Parameters 570 ---------- 571 vars : list or str, default='common' 572 names of attribute variables to include in summary 573 names are case insensitive 574 The following strings are also accepted in place of a list 575 "all" (displays all variables) 576 "common" (displays all measures of mean difference) 577 fitline_kw : dict, default=None 578 keywords passed to `fitline` 579 intformat : str, default='{:d}' 580 format specifier for integer values 581 floatformat : str, default='{:.4f}' 582 format specifier for floating point values 583 floatformat_fiteqn : str, default=floatformat 584 format specifier for slope and intercept (a,b) in y = a x + b 585 stringlength : int, default=None 586 length of the variables on output 587 default (None) is to use the length of the longest variable name 588 589 Returns 590 ------- 591 summary : str 592 names and values of variables 593 ''' 594 # List of variables 595 variables = self._expand_variables(variables) 596 597 if floatformat_fiteqn is None: 598 floatformat_fiteqn = floatformat 599 if stringlength is None: 600 stringlength = np.max([len(v) for v in variables]) 601 stringformat = '{:'+str(stringlength)+'s}' 602 603 # Get a dict containing the needed variables 604 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 605 606 # Extract length of the float numbers from floatformat 607 # import re 608 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 609 # floatformat )[0] ) ).astype(int) 610 611 # summary = (stringformat+'{:>10s}').format('Variable','Value') 612 summarytext = '' 613 for k,v in summarydict.items(): 614 vstr = _number2str(v,intformat,floatformat) 615 summarytext += (stringformat+' = {:s}\n').format(k,vstr) 616 617 return summarytext 618 619 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 620 intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, 621 loc=None, loc_units='axes', 622 **kwargs): 623 '''Display bivariate statistics as a table inset on a plot axis 624 625 Parameters 626 ---------- 627 ax : matplotlib.Figure.Axis 628 axis where the table will be displayed 629 variables : list or str, default='common' 630 names of attribute variables to include in summary 631 names are case insensitive 632 The following strings are also accepted in place of a list 633 "all" (displays all variables) 634 "common" (displays all measures of mean difference) 635 fitline_kw : dict, default=None 636 keywords passed to `fitline` 637 intformat : str, default='{:d}' 638 format specifier for integer values 639 floatformat : str, default='{:.3f}' 640 format specifier for floating point values 641 floatformat_fiteqn : str, default=floatformat 642 format specifier for slope and intercept (a,b) in y = a x + b 643 loc : tuple (x0,y0), default=(0.85, 0.05) 644 location on the axis where the table will be drawn 645 can be in data units or axes units [0-1] 646 loc_units : {'axes' (default), 'data'} 647 specifies whether loc has 'data' units or 'axes' units [0-1] 648 649 Returns 650 ------- 651 text1, text2 : matplotlib text object 652 Artist for the two text boxes 653 ''' 654 # List of variables 655 variables = self._expand_variables(variables) 656 657 if floatformat_fiteqn is None: 658 floatformat_fiteqn = floatformat 659 660 # Default location in lower right corner 661 if loc is None: 662 loc = (0.8,0.05) 663 664 # Coordinates for loc 665 if loc_units.lower()=='data': 666 coord=ax.transData 667 elif loc_units.lower() in ['axes','axis']: 668 coord=ax.transAxes 669 else: 670 raise ValueError('Display units should be "Data" or "Axes"') 671 672 # Get a dict containing the needed variables 673 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 674 675 # Column of label text 676 label_text = '\n'.join([_texify_name(key) 677 for key in summarydict]) 678 # Column of value text 679 value_text = '\n'.join([_number2str(v,intformat,floatformat) 680 for v in summarydict.values()]) 681 682 # Check if horizontal alignment keyword is used 683 ha='' 684 try: 685 ha = kwargs['ha'] 686 except KeyError: 687 pass 688 try: 689 ha = kwargs['horizontalalignment'] 690 except KeyError: 691 pass 692 693 # For right alignment, align on values first 694 # Otherwise, align on labels 695 if ha=='right': 696 first_text = value_text 697 second_text = label_text 698 sign = -1 699 else: 700 first_text = label_text 701 second_text = value_text 702 sign = +1 703 704 # Add first column of text 705 t1=ax.text(loc[0],loc[1], 706 first_text, 707 transform=coord, 708 **kwargs 709 ) 710 711 # Get width of first text column 712 bbox = t1.get_window_extent().transformed(coord.inverted()) 713 width = bbox.x1-bbox.x0 714 715 # Add second column of text 716 t2 = ax.text(loc[0]+width*sign,loc[1], 717 second_text, 718 transform=coord, 719 **kwargs 720 ) 721 722 ################################## 723 # Early version of this function using matplotlib.table.table() 724 725 # if isinstance(loc,(tuple,list)): 726 # # Create an inset axis to contain the table 727 # tableaxis = ax.inset_axes(loc) 728 # table_width=1 729 # else: 730 # tableaxis = ax 731 732 # # Display the table on the axis 733 # return mtable.table( 734 # tableaxis, 735 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 736 # rowLabels=[texify_name(key) for key in summarydict], 737 # colWidths=[table_width/2]*2, 738 # edges=edges, 739 # loc=loc, bbox=bbox 740 # ) 741 742 return [t1,t2]
188class BivariateStatistics: 189 '''A suite of common statistics to quantify bivariate relationships 190 191 Class method 'summary' provides a formatted summary of these statistics 192 193 Attributes 194 ---------- 195 count, n : int 196 number of valid (not NaN) data value pairs 197 xmean, ymean : float 198 mean of x and y variables 199 xmedian, ymedian :float 200 median of x and y variables 201 xstd, ystd : float 202 standard deviation of x and y variables 203 mean_difference, md : float 204 ymean - xmean 205 mean_absolute_difference, mad : float 206 mean( |y-x| ) 207 relative_mean_difference, rmd : float 208 md / xmean 209 relative_mean_absolute_difference, rmad :float 210 mad / xmean 211 standardized_mean_difference, smd : float 212 md / xstd 213 standardized_mean_absolute_difference, smad : float 214 mad /xstd 215 mean_relative_difference, mrd : float 216 mean(y/x) - 1 217 mean_log10_ratio, mlr : float 218 mean( log10(y/x) ) 219 mean_absolute_log10_ratio, malr : float 220 mean( abs( log10(y/x) ) ) 221 median_difference, medd : float 222 median(y-x) 223 median_absolute_difference, medad : float 224 median(|y-x|) 225 relative_median_difference, rmedd : float 226 median(y-x) / xmedian 227 relative_median_absolute_difference, rmedad : float 228 median(|y-x|) / xmedian 229 median_relative_difference, medianrd, medrd : float 230 median(y/x)-1 231 median_log10_ratio, medlr : float 232 median( log10(y/x) ) 233 median_absolute_log10_ratio, medalr : float 234 median( abs( log10(y/x) ) ) 235 normalized_mean_bias_factor, nmbf : float 236 see `nmbf` 237 normalized_mean_absolute_error_factor, nmaef : float 238 see `nmaef` 239 root_mean_square_difference, rmsd : float 240 $\\sqrt{ \\langle (y - x)^2 \\rangle }$ 241 covariance : float 242 cov(x,y) 243 correlation_pearson, correlation, pearsonr, R, r : float 244 Pearson linear correlation coefficient 245 correlation_spearman, spearmanr : float 246 Spearman, non-parametric rank correlation coefficient 247 R2, r2 : float 248 Linear coefficient of determination, $R^2$ 249 ''' 250 251 def __init__(self,x,y,w=None,dropna=False,data=None): 252 '''Compute suite of bivariate statistics during initialization 253 254 Statistic values are saved in attributes. 255 CAUTION: Weights w are ignored except in SMA fit 256 257 Parameters 258 ---------- 259 x : ndarray or str 260 independent variable values 261 y : ndarray or str 262 dependent variable values, same size as x 263 w : ndarray or str, optional 264 weights for points (x,y), same size as x and y 265 dropna : bool, optional (default=False) 266 drops NaN values from x, y, and w 267 data : dict-like, optional 268 if x, y, or w are str, then they should be keys in data 269 ''' 270 271 # Get values from data if needed 272 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 273 raise ValueError( 'Data argument must be used if x, y, or w is a string') 274 if isinstance(x,str): 275 x = data[x] 276 if isinstance(y,str): 277 y = data[y] 278 if isinstance(w,str): 279 w = data[w] 280 281 #Ensure that x and y have same length 282 if len(x) != len(y): 283 raise ValueError( 'Arguments x and y must have the same length' ) 284 if w is None: 285 w = np.ones_like(x) 286 if len(w) != len(x): 287 raise ValueError( 'Argument w (if present) must have the same length as x' ) 288 289 # Drop NaN values 290 if dropna: 291 isna = np.isnan(x*y*w) 292 x = x[~isna] 293 y = y[~isna] 294 w = w[~isna] 295 296 # Differences and ratios used repeatedly 297 diff = y - x 298 absdiff = np.abs( y - x ) 299 # Ignore divide by zero and 0/0 while dividing 300 old_settings = np.seterr(divide='ignore',invalid='ignore') 301 ratio = y/x 302 log10ratio = np.log10(ratio) 303 np.seterr(**old_settings) 304 305 # Number of data points 306 self.count = self.n = len(x) 307 308 # Means, medians, and standard deviations 309 self.xmean = np.mean(x) 310 self.ymean = np.mean(y) 311 self.xmedian = np.median(x) 312 self.ymedian = np.median(y) 313 self.xstd = np.std(x) 314 self.ystd = np.std(y) 315 316 # Save values for use later 317 self._x = x 318 self._y = y 319 self._w = w 320 321 # Mean and mean absolute differences 322 self.mean_difference = self.md = self.ymean - self.xmean 323 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 324 325 # Relative and standardized differences 326 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 327 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 328 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 329 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 330 331 # Mean and median relative differences 332 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 333 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 334 335 # Median and median absolute differences 336 self.median_difference = self.medd = np.median( diff ) 337 self.median_absolute_difference = self.medad = np.median( absdiff ) 338 339 # Relative median differences 340 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 341 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 342 343 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 344 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 345 346 # Mean and mean absolute log ratio 347 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 348 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 349 350 # Median and median absolute log ratio 351 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 352 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 353 354 # RMS difference 355 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 356 357 # Covariance, correlation 358 self.covariance = np.cov(x,y)[0][1] 359 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 360 np.corrcoef(x,y)[0][1] 361 self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 362 self.R2 = self.r2 = self.R**2 363 364 def __getitem__(self,key): 365 '''Accesses attribute values via object['key']''' 366 return getattr(self,key) 367 368 def fitline(self,method='sma',intercept=True,**kwargs): 369 '''Compute bivariate line fit 370 371 Parameters 372 ---------- 373 method : str 374 line fitting method: sma (default), ols, wls, York, sen, siegel 375 intercept : bool 376 defines whether non-zero intercept should be fitted 377 **kwargs 378 passed to `acgc.stats.sma` (e.g. robust=True) 379 380 Returns 381 ------- 382 result : dict 383 dictionary with keys: 384 - slope (float) 385 slope of fitted line 386 - intercept (float) 387 intercept of fitted line 388 - fittedvalues (array (N,)) 389 values on fit line 390 - residuals (array (N,)) 391 residual from fit line 392 ''' 393 394 fitintercept = intercept 395 396 if method.lower()=='sma': 397 fit = sma( self._x, 398 self._y, 399 self._w, 400 intercept=fitintercept, 401 **kwargs) 402 slope = fit['slope'] 403 intercept= fit['intercept'] 404 405 elif method.lower()=='ols': 406 if fitintercept: 407 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 408 self._y, rcond=None ) 409 else: 410 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 411 slope = ols[0][0] 412 intercept = ols[0][1] 413 414 elif method.lower() in ['theil','sen','theilsen']: 415 fitintercept = True 416 fit = sen( self._x, 417 self._y, 418 **kwargs) 419 slope = fit.slope 420 intercept = fit.intercept 421 422 elif method.lower()=='siegel': 423 fitintercept = True 424 siegel = stats.siegelslopes( self._x, 425 self._y ) 426 slope = siegel.slope 427 intercept = siegel.intercept 428 429 elif method.lower()=='wls': 430 raise NotImplementedError('WLS regression not implemented yet') 431 432 elif method.lower()=='york': 433 raise NotImplementedError('York regression not implemented yet') 434 435 else: 436 raise ValueError('Undefined method '+method) 437 438 line = dict( slope = slope, 439 intercept = intercept, 440 fittedvalues = slope * self._x + intercept, 441 residuals = self._y - ( slope * self._x + intercept ), 442 method = method, 443 fitintercept = fitintercept ) 444 445 return line 446 447 def slope(self,method='sma',intercept=True,**kwargs): 448 '''Compute slope of bivariate line fit 449 450 Parameters 451 ---------- 452 method : str 453 line fitting method: sma (default), ols, wls 454 intercept : bool 455 defines whether non-zero intercept should be fitted 456 **kwargs 457 passed to `fitline` 458 459 Returns 460 ------- 461 slope : float 462 value of y intercept 463 ''' 464 return self.fitline(method,intercept,**kwargs)['slope'] 465 466 def intercept(self,method='sma',intercept=True,**kwargs): 467 '''Compute intercept of bivariate line fit 468 469 Parameters 470 ---------- 471 method : str 472 line fitting method: sma (default) or ols 473 intercept : bool 474 defines whether non-zero intercept should be fitted 475 **kwargs 476 passed to `fitline` 477 478 Returns 479 ------- 480 intercept : float 481 value of y intercept 482 ''' 483 return self.fitline(method,intercept,**kwargs)['intercept'] 484 485 def _expand_variables(self,variables): 486 '''Expand special strings into a list of variables 487 488 Parameter 489 --------- 490 variables : list or str, default='common' 491 Special strings ("all","common") will be expanded to a list of variables 492 list arguments will not be modified 493 494 Returns 495 ------- 496 list 497 variable names 498 ''' 499 if variables is None: 500 variables='common' 501 if variables=='all': 502 variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD', 503 'MLR','MALR', 504 'MedD','MedAD','RMedD','RMedAD','MedRD', 505 'MedLR','MedALR', 506 'NMBF','NMAEF','RMSD','cov', 507 'R','R2','spearmanr','slope','intercept', 508 'fitline','n'] 509 elif variables=='common': 510 variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n'] 511 if not isinstance(variables,list): 512 raise ValueError( 513 'variables must be a list, None, or one of these strings: "all","common"') 514 515 return variables 516 517 def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'): 518 '''Summarize bivariate statistics into a dict 519 520 Parameters 521 ---------- 522 vars : list or str, default='common' 523 names of attribute variables to include in summary 524 names are case insensitive 525 The following strings are also accepted in place of a list 526 "all" (displays all variables) 527 "common" (displays all measures of mean difference) 528 fitline_kw : dict, default=None 529 keywords passed to `fitline` 530 floatformat_fiteqn : str, default=floatformat 531 format specifier for slope and intercept (a,b) in y = a x + b 532 533 Returns 534 ------- 535 summary : dict 536 names and values of variables 537 ''' 538 539 # List of variables 540 variables = self._expand_variables(variables) 541 542 if fitline_kw is None: 543 fitline_kw = {'method':'sma', 544 'intercept':True} 545 546 # Construct the dict 547 summary = {} 548 for v in variables: 549 if v in ['slope','intercept']: 550 # These variables are object methods 551 func = getattr(self,v) 552 value = func(**fitline_kw) 553 elif v == 'fitline': 554 line = self.fitline(**fitline_kw) 555 v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate') 556 else: 557 # Retrieve values 558 value = getattr(self,v.lower()) 559 560 # summary += (stringformat+'='+floatformat+'\n').format(v,value) 561 summary[v] = value 562 563 return summary 564 565 def summary(self, variables=None, fitline_kw=None, 566 intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, 567 stringlength=None ): 568 '''Summarize bivariate statistics 569 570 Parameters 571 ---------- 572 vars : list or str, default='common' 573 names of attribute variables to include in summary 574 names are case insensitive 575 The following strings are also accepted in place of a list 576 "all" (displays all variables) 577 "common" (displays all measures of mean difference) 578 fitline_kw : dict, default=None 579 keywords passed to `fitline` 580 intformat : str, default='{:d}' 581 format specifier for integer values 582 floatformat : str, default='{:.4f}' 583 format specifier for floating point values 584 floatformat_fiteqn : str, default=floatformat 585 format specifier for slope and intercept (a,b) in y = a x + b 586 stringlength : int, default=None 587 length of the variables on output 588 default (None) is to use the length of the longest variable name 589 590 Returns 591 ------- 592 summary : str 593 names and values of variables 594 ''' 595 # List of variables 596 variables = self._expand_variables(variables) 597 598 if floatformat_fiteqn is None: 599 floatformat_fiteqn = floatformat 600 if stringlength is None: 601 stringlength = np.max([len(v) for v in variables]) 602 stringformat = '{:'+str(stringlength)+'s}' 603 604 # Get a dict containing the needed variables 605 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 606 607 # Extract length of the float numbers from floatformat 608 # import re 609 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 610 # floatformat )[0] ) ).astype(int) 611 612 # summary = (stringformat+'{:>10s}').format('Variable','Value') 613 summarytext = '' 614 for k,v in summarydict.items(): 615 vstr = _number2str(v,intformat,floatformat) 616 summarytext += (stringformat+' = {:s}\n').format(k,vstr) 617 618 return summarytext 619 620 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 621 intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, 622 loc=None, loc_units='axes', 623 **kwargs): 624 '''Display bivariate statistics as a table inset on a plot axis 625 626 Parameters 627 ---------- 628 ax : matplotlib.Figure.Axis 629 axis where the table will be displayed 630 variables : list or str, default='common' 631 names of attribute variables to include in summary 632 names are case insensitive 633 The following strings are also accepted in place of a list 634 "all" (displays all variables) 635 "common" (displays all measures of mean difference) 636 fitline_kw : dict, default=None 637 keywords passed to `fitline` 638 intformat : str, default='{:d}' 639 format specifier for integer values 640 floatformat : str, default='{:.3f}' 641 format specifier for floating point values 642 floatformat_fiteqn : str, default=floatformat 643 format specifier for slope and intercept (a,b) in y = a x + b 644 loc : tuple (x0,y0), default=(0.85, 0.05) 645 location on the axis where the table will be drawn 646 can be in data units or axes units [0-1] 647 loc_units : {'axes' (default), 'data'} 648 specifies whether loc has 'data' units or 'axes' units [0-1] 649 650 Returns 651 ------- 652 text1, text2 : matplotlib text object 653 Artist for the two text boxes 654 ''' 655 # List of variables 656 variables = self._expand_variables(variables) 657 658 if floatformat_fiteqn is None: 659 floatformat_fiteqn = floatformat 660 661 # Default location in lower right corner 662 if loc is None: 663 loc = (0.8,0.05) 664 665 # Coordinates for loc 666 if loc_units.lower()=='data': 667 coord=ax.transData 668 elif loc_units.lower() in ['axes','axis']: 669 coord=ax.transAxes 670 else: 671 raise ValueError('Display units should be "Data" or "Axes"') 672 673 # Get a dict containing the needed variables 674 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 675 676 # Column of label text 677 label_text = '\n'.join([_texify_name(key) 678 for key in summarydict]) 679 # Column of value text 680 value_text = '\n'.join([_number2str(v,intformat,floatformat) 681 for v in summarydict.values()]) 682 683 # Check if horizontal alignment keyword is used 684 ha='' 685 try: 686 ha = kwargs['ha'] 687 except KeyError: 688 pass 689 try: 690 ha = kwargs['horizontalalignment'] 691 except KeyError: 692 pass 693 694 # For right alignment, align on values first 695 # Otherwise, align on labels 696 if ha=='right': 697 first_text = value_text 698 second_text = label_text 699 sign = -1 700 else: 701 first_text = label_text 702 second_text = value_text 703 sign = +1 704 705 # Add first column of text 706 t1=ax.text(loc[0],loc[1], 707 first_text, 708 transform=coord, 709 **kwargs 710 ) 711 712 # Get width of first text column 713 bbox = t1.get_window_extent().transformed(coord.inverted()) 714 width = bbox.x1-bbox.x0 715 716 # Add second column of text 717 t2 = ax.text(loc[0]+width*sign,loc[1], 718 second_text, 719 transform=coord, 720 **kwargs 721 ) 722 723 ################################## 724 # Early version of this function using matplotlib.table.table() 725 726 # if isinstance(loc,(tuple,list)): 727 # # Create an inset axis to contain the table 728 # tableaxis = ax.inset_axes(loc) 729 # table_width=1 730 # else: 731 # tableaxis = ax 732 733 # # Display the table on the axis 734 # return mtable.table( 735 # tableaxis, 736 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 737 # rowLabels=[texify_name(key) for key in summarydict], 738 # colWidths=[table_width/2]*2, 739 # edges=edges, 740 # loc=loc, bbox=bbox 741 # ) 742 743 return [t1,t2]
A suite of common statistics to quantify bivariate relationships
Class method 'summary' provides a formatted summary of these statistics
Attributes
- count, n (int): number of valid (not NaN) data value pairs
- xmean, ymean (float): mean of x and y variables
- xmedian, ymedian (float): median of x and y variables
- xstd, ystd (float): standard deviation of x and y variables
- mean_difference, md (float): ymean - xmean
- mean_absolute_difference, mad (float): mean( |y-x| )
- relative_mean_difference, rmd (float): md / xmean
- relative_mean_absolute_difference, rmad (float): mad / xmean
- standardized_mean_difference, smd (float): md / xstd
- standardized_mean_absolute_difference, smad (float): mad /xstd
- mean_relative_difference, mrd (float): mean(y/x) - 1
- mean_log10_ratio, mlr (float): mean( log10(y/x) )
- mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
- median_difference, medd (float): median(y-x)
- median_absolute_difference, medad (float): median(|y-x|)
- relative_median_difference, rmedd (float): median(y-x) / xmedian
- relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
- median_relative_difference, medianrd, medrd (float): median(y/x)-1
- median_log10_ratio, medlr (float): median( log10(y/x) )
- median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
- normalized_mean_bias_factor, nmbf (float):
see
nmbf - normalized_mean_absolute_error_factor, nmaef (float):
see
nmaef - root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
- covariance (float): cov(x,y)
- correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
- correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
- R2, r2 (float): Linear coefficient of determination, $R^2$
251 def __init__(self,x,y,w=None,dropna=False,data=None): 252 '''Compute suite of bivariate statistics during initialization 253 254 Statistic values are saved in attributes. 255 CAUTION: Weights w are ignored except in SMA fit 256 257 Parameters 258 ---------- 259 x : ndarray or str 260 independent variable values 261 y : ndarray or str 262 dependent variable values, same size as x 263 w : ndarray or str, optional 264 weights for points (x,y), same size as x and y 265 dropna : bool, optional (default=False) 266 drops NaN values from x, y, and w 267 data : dict-like, optional 268 if x, y, or w are str, then they should be keys in data 269 ''' 270 271 # Get values from data if needed 272 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 273 raise ValueError( 'Data argument must be used if x, y, or w is a string') 274 if isinstance(x,str): 275 x = data[x] 276 if isinstance(y,str): 277 y = data[y] 278 if isinstance(w,str): 279 w = data[w] 280 281 #Ensure that x and y have same length 282 if len(x) != len(y): 283 raise ValueError( 'Arguments x and y must have the same length' ) 284 if w is None: 285 w = np.ones_like(x) 286 if len(w) != len(x): 287 raise ValueError( 'Argument w (if present) must have the same length as x' ) 288 289 # Drop NaN values 290 if dropna: 291 isna = np.isnan(x*y*w) 292 x = x[~isna] 293 y = y[~isna] 294 w = w[~isna] 295 296 # Differences and ratios used repeatedly 297 diff = y - x 298 absdiff = np.abs( y - x ) 299 # Ignore divide by zero and 0/0 while dividing 300 old_settings = np.seterr(divide='ignore',invalid='ignore') 301 ratio = y/x 302 log10ratio = np.log10(ratio) 303 np.seterr(**old_settings) 304 305 # Number of data points 306 self.count = self.n = len(x) 307 308 # Means, medians, and standard deviations 309 self.xmean = np.mean(x) 310 self.ymean = np.mean(y) 311 self.xmedian = np.median(x) 312 self.ymedian = np.median(y) 313 self.xstd = np.std(x) 314 self.ystd = np.std(y) 315 316 # Save values for use later 317 self._x = x 318 self._y = y 319 self._w = w 320 321 # Mean and mean absolute differences 322 self.mean_difference = self.md = self.ymean - self.xmean 323 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 324 325 # Relative and standardized differences 326 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 327 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 328 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 329 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 330 331 # Mean and median relative differences 332 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 333 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 334 335 # Median and median absolute differences 336 self.median_difference = self.medd = np.median( diff ) 337 self.median_absolute_difference = self.medad = np.median( absdiff ) 338 339 # Relative median differences 340 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 341 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 342 343 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 344 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 345 346 # Mean and mean absolute log ratio 347 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 348 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 349 350 # Median and median absolute log ratio 351 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 352 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 353 354 # RMS difference 355 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 356 357 # Covariance, correlation 358 self.covariance = np.cov(x,y)[0][1] 359 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 360 np.corrcoef(x,y)[0][1] 361 self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 362 self.R2 = self.r2 = self.R**2
Compute suite of bivariate statistics during initialization
Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit
Parameters
- x (ndarray or str): independent variable values
- y (ndarray or str): dependent variable values, same size as x
- w (ndarray or str, optional): weights for points (x,y), same size as x and y
- dropna (bool, optional (default=False)): drops NaN values from x, y, and w
- data (dict-like, optional): if x, y, or w are str, then they should be keys in data
368 def fitline(self,method='sma',intercept=True,**kwargs): 369 '''Compute bivariate line fit 370 371 Parameters 372 ---------- 373 method : str 374 line fitting method: sma (default), ols, wls, York, sen, siegel 375 intercept : bool 376 defines whether non-zero intercept should be fitted 377 **kwargs 378 passed to `acgc.stats.sma` (e.g. robust=True) 379 380 Returns 381 ------- 382 result : dict 383 dictionary with keys: 384 - slope (float) 385 slope of fitted line 386 - intercept (float) 387 intercept of fitted line 388 - fittedvalues (array (N,)) 389 values on fit line 390 - residuals (array (N,)) 391 residual from fit line 392 ''' 393 394 fitintercept = intercept 395 396 if method.lower()=='sma': 397 fit = sma( self._x, 398 self._y, 399 self._w, 400 intercept=fitintercept, 401 **kwargs) 402 slope = fit['slope'] 403 intercept= fit['intercept'] 404 405 elif method.lower()=='ols': 406 if fitintercept: 407 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 408 self._y, rcond=None ) 409 else: 410 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 411 slope = ols[0][0] 412 intercept = ols[0][1] 413 414 elif method.lower() in ['theil','sen','theilsen']: 415 fitintercept = True 416 fit = sen( self._x, 417 self._y, 418 **kwargs) 419 slope = fit.slope 420 intercept = fit.intercept 421 422 elif method.lower()=='siegel': 423 fitintercept = True 424 siegel = stats.siegelslopes( self._x, 425 self._y ) 426 slope = siegel.slope 427 intercept = siegel.intercept 428 429 elif method.lower()=='wls': 430 raise NotImplementedError('WLS regression not implemented yet') 431 432 elif method.lower()=='york': 433 raise NotImplementedError('York regression not implemented yet') 434 435 else: 436 raise ValueError('Undefined method '+method) 437 438 line = dict( slope = slope, 439 intercept = intercept, 440 fittedvalues = slope * self._x + intercept, 441 residuals = self._y - ( slope * self._x + intercept ), 442 method = method, 443 fitintercept = fitintercept ) 444 445 return line
Compute bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
acgc.stats.sma(e.g. robust=True)
Returns
- result (dict):
dictionary with keys:
- slope (float) slope of fitted line
- intercept (float) intercept of fitted line
- fittedvalues (array (N,)) values on fit line
- residuals (array (N,)) residual from fit line
447 def slope(self,method='sma',intercept=True,**kwargs): 448 '''Compute slope of bivariate line fit 449 450 Parameters 451 ---------- 452 method : str 453 line fitting method: sma (default), ols, wls 454 intercept : bool 455 defines whether non-zero intercept should be fitted 456 **kwargs 457 passed to `fitline` 458 459 Returns 460 ------- 461 slope : float 462 value of y intercept 463 ''' 464 return self.fitline(method,intercept,**kwargs)['slope']
Compute slope of bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- slope (float): value of y intercept
466 def intercept(self,method='sma',intercept=True,**kwargs): 467 '''Compute intercept of bivariate line fit 468 469 Parameters 470 ---------- 471 method : str 472 line fitting method: sma (default) or ols 473 intercept : bool 474 defines whether non-zero intercept should be fitted 475 **kwargs 476 passed to `fitline` 477 478 Returns 479 ------- 480 intercept : float 481 value of y intercept 482 ''' 483 return self.fitline(method,intercept,**kwargs)['intercept']
Compute intercept of bivariate line fit
Parameters
- method (str): line fitting method: sma (default) or ols
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- intercept (float): value of y intercept
517 def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'): 518 '''Summarize bivariate statistics into a dict 519 520 Parameters 521 ---------- 522 vars : list or str, default='common' 523 names of attribute variables to include in summary 524 names are case insensitive 525 The following strings are also accepted in place of a list 526 "all" (displays all variables) 527 "common" (displays all measures of mean difference) 528 fitline_kw : dict, default=None 529 keywords passed to `fitline` 530 floatformat_fiteqn : str, default=floatformat 531 format specifier for slope and intercept (a,b) in y = a x + b 532 533 Returns 534 ------- 535 summary : dict 536 names and values of variables 537 ''' 538 539 # List of variables 540 variables = self._expand_variables(variables) 541 542 if fitline_kw is None: 543 fitline_kw = {'method':'sma', 544 'intercept':True} 545 546 # Construct the dict 547 summary = {} 548 for v in variables: 549 if v in ['slope','intercept']: 550 # These variables are object methods 551 func = getattr(self,v) 552 value = func(**fitline_kw) 553 elif v == 'fitline': 554 line = self.fitline(**fitline_kw) 555 v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate') 556 else: 557 # Retrieve values 558 value = getattr(self,v.lower()) 559 560 # summary += (stringformat+'='+floatformat+'\n').format(v,value) 561 summary[v] = value 562 563 return summary
Summarize bivariate statistics into a dict
Parameters
- vars (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference) - fitline_kw (dict, default=None):
keywords passed to
fitline - floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
Returns
- summary (dict): names and values of variables
565 def summary(self, variables=None, fitline_kw=None, 566 intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, 567 stringlength=None ): 568 '''Summarize bivariate statistics 569 570 Parameters 571 ---------- 572 vars : list or str, default='common' 573 names of attribute variables to include in summary 574 names are case insensitive 575 The following strings are also accepted in place of a list 576 "all" (displays all variables) 577 "common" (displays all measures of mean difference) 578 fitline_kw : dict, default=None 579 keywords passed to `fitline` 580 intformat : str, default='{:d}' 581 format specifier for integer values 582 floatformat : str, default='{:.4f}' 583 format specifier for floating point values 584 floatformat_fiteqn : str, default=floatformat 585 format specifier for slope and intercept (a,b) in y = a x + b 586 stringlength : int, default=None 587 length of the variables on output 588 default (None) is to use the length of the longest variable name 589 590 Returns 591 ------- 592 summary : str 593 names and values of variables 594 ''' 595 # List of variables 596 variables = self._expand_variables(variables) 597 598 if floatformat_fiteqn is None: 599 floatformat_fiteqn = floatformat 600 if stringlength is None: 601 stringlength = np.max([len(v) for v in variables]) 602 stringformat = '{:'+str(stringlength)+'s}' 603 604 # Get a dict containing the needed variables 605 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 606 607 # Extract length of the float numbers from floatformat 608 # import re 609 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 610 # floatformat )[0] ) ).astype(int) 611 612 # summary = (stringformat+'{:>10s}').format('Variable','Value') 613 summarytext = '' 614 for k,v in summarydict.items(): 615 vstr = _number2str(v,intformat,floatformat) 616 summarytext += (stringformat+' = {:s}\n').format(k,vstr) 617 618 return summarytext
Summarize bivariate statistics
Parameters
- vars (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference) - fitline_kw (dict, default=None):
keywords passed to
fitline - intformat : str, default='{ (d}'): format specifier for integer values
- floatformat : str, default='{ (.4f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
Returns
- summary (str): names and values of variables
620 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 621 intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, 622 loc=None, loc_units='axes', 623 **kwargs): 624 '''Display bivariate statistics as a table inset on a plot axis 625 626 Parameters 627 ---------- 628 ax : matplotlib.Figure.Axis 629 axis where the table will be displayed 630 variables : list or str, default='common' 631 names of attribute variables to include in summary 632 names are case insensitive 633 The following strings are also accepted in place of a list 634 "all" (displays all variables) 635 "common" (displays all measures of mean difference) 636 fitline_kw : dict, default=None 637 keywords passed to `fitline` 638 intformat : str, default='{:d}' 639 format specifier for integer values 640 floatformat : str, default='{:.3f}' 641 format specifier for floating point values 642 floatformat_fiteqn : str, default=floatformat 643 format specifier for slope and intercept (a,b) in y = a x + b 644 loc : tuple (x0,y0), default=(0.85, 0.05) 645 location on the axis where the table will be drawn 646 can be in data units or axes units [0-1] 647 loc_units : {'axes' (default), 'data'} 648 specifies whether loc has 'data' units or 'axes' units [0-1] 649 650 Returns 651 ------- 652 text1, text2 : matplotlib text object 653 Artist for the two text boxes 654 ''' 655 # List of variables 656 variables = self._expand_variables(variables) 657 658 if floatformat_fiteqn is None: 659 floatformat_fiteqn = floatformat 660 661 # Default location in lower right corner 662 if loc is None: 663 loc = (0.8,0.05) 664 665 # Coordinates for loc 666 if loc_units.lower()=='data': 667 coord=ax.transData 668 elif loc_units.lower() in ['axes','axis']: 669 coord=ax.transAxes 670 else: 671 raise ValueError('Display units should be "Data" or "Axes"') 672 673 # Get a dict containing the needed variables 674 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 675 676 # Column of label text 677 label_text = '\n'.join([_texify_name(key) 678 for key in summarydict]) 679 # Column of value text 680 value_text = '\n'.join([_number2str(v,intformat,floatformat) 681 for v in summarydict.values()]) 682 683 # Check if horizontal alignment keyword is used 684 ha='' 685 try: 686 ha = kwargs['ha'] 687 except KeyError: 688 pass 689 try: 690 ha = kwargs['horizontalalignment'] 691 except KeyError: 692 pass 693 694 # For right alignment, align on values first 695 # Otherwise, align on labels 696 if ha=='right': 697 first_text = value_text 698 second_text = label_text 699 sign = -1 700 else: 701 first_text = label_text 702 second_text = value_text 703 sign = +1 704 705 # Add first column of text 706 t1=ax.text(loc[0],loc[1], 707 first_text, 708 transform=coord, 709 **kwargs 710 ) 711 712 # Get width of first text column 713 bbox = t1.get_window_extent().transformed(coord.inverted()) 714 width = bbox.x1-bbox.x0 715 716 # Add second column of text 717 t2 = ax.text(loc[0]+width*sign,loc[1], 718 second_text, 719 transform=coord, 720 **kwargs 721 ) 722 723 ################################## 724 # Early version of this function using matplotlib.table.table() 725 726 # if isinstance(loc,(tuple,list)): 727 # # Create an inset axis to contain the table 728 # tableaxis = ax.inset_axes(loc) 729 # table_width=1 730 # else: 731 # tableaxis = ax 732 733 # # Display the table on the axis 734 # return mtable.table( 735 # tableaxis, 736 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 737 # rowLabels=[texify_name(key) for key in summarydict], 738 # colWidths=[table_width/2]*2, 739 # edges=edges, 740 # loc=loc, bbox=bbox 741 # ) 742 743 return [t1,t2]
Display bivariate statistics as a table inset on a plot axis
Parameters
- ax (matplotlib.Figure.Axis): axis where the table will be displayed
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference) - fitline_kw (dict, default=None):
keywords passed to
fitline - intformat : str, default='{ (d}'): format specifier for integer values
- floatformat : str, default='{ (.3f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- loc (tuple (x0,y0), default=(0.85, 0.05)): location on the axis where the table will be drawn can be in data units or axes units [0-1]
- loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
- text1, text2 (matplotlib text object): Artist for the two text boxes
22def nmb( x0, x1 ): 23 '''Compute Normalized Mean Bias (NMB) 24 25 NMB = ( mean(x1) - mean(x0) ) / mean(x0) 26 27 Parameters 28 ---------- 29 x0 : array_like 30 reference values 31 x1 : array_like 32 experiment values 33 ''' 34 35 assert (len(x0) == len(x1)), \ 36 "Parameters x0 and x1 must have the same length" 37 38 # Mean values 39 x0_mean = np.mean(x0) 40 x1_mean = np.mean(x1) 41 42 # Metric value 43 return x1_mean / x0_mean - 1
Compute Normalized Mean Bias (NMB)
NMB = ( mean(x1) - mean(x0) ) / mean(x0)
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
45def nmae( x0, x1 ): 46 '''Compute Normalized Mean Absolute Error (NMAE) 47 48 NMAE = mean(abs(x1 - x0)) / abs(mean(x0)) 49 50 Parameters 51 --------- 52 x0 : array_like 53 reference values 54 x1 : array_like 55 experiment values 56 ''' 57 58 # Mean values 59 x0_mean = np.mean(x0) 60 61 # Mean absolute difference 62 abs_diff = np.mean( np.abs(x1 - x0) ) 63 64 # Metric value 65 return abs_diff / np.abs( x0_mean )
Compute Normalized Mean Absolute Error (NMAE)
NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
68def nmbf( x0, x1 ): 69 '''Compute Normalized Mean Bias Factor (NMBF) 70 71 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 72 73 Parameters 74 ---------- 75 x0 : array_like 76 reference values 77 x1 : array_like 78 experiment values 79 ''' 80 81 # Ensure that arguments have the same length 82 assert (len(x0) == len(x1)), \ 83 "Parameters x0 and x1 must have the same length" 84 85 # Mean values 86 x0_mean = np.mean(x0) 87 x1_mean = np.mean(x1) 88 89 # Metric value 90 if x1_mean >= x0_mean: 91 result = x1_mean / x0_mean - 1 92 else: 93 result= 1 - x0_mean / x1_mean 94 # Equivalent (faster?) implementation 95 #S = (mMean - oMean) / np.abs(mMean - oMean) 96 #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 ) 97 98 return result
Compute Normalized Mean Bias Factor (NMBF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
100def nmaef( x0, x1 ): 101 '''Compute Normalized Mean Absolute Error Factor (NMAEF) 102 103 Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125 104 105 Parameters 106 ---------- 107 x0 : array_like 108 reference values 109 x1 : array_like 110 experiment values 111 ''' 112 113 # Ensure that arguments have the same length 114 assert (len(x0) == len(x1)), \ 115 "Parameters x0 and x1 must have the same length" 116 117 # Mean values 118 x0_mean = np.mean(x0) 119 x1_mean = np.mean(x1) 120 121 # Mean absolute difference 122 abs_diff = np.mean( np.abs(x1 - x0)) 123 124 # Metric value 125 if x1_mean >= x0_mean: 126 result = abs_diff / x0_mean 127 else: 128 result = abs_diff / x1_mean 129 # Equivalent (faster?) implementation 130 #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean) 131 #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) ) 132 133 return result
Compute Normalized Mean Absolute Error Factor (NMAEF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values