acgc.stats.bivariate

Bivariate statistics

Statistical measures of relationships between two populations

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3""" Bivariate statistics
  4
  5Statistical measures of relationships between two populations
  6"""
  7
  8import numpy as np
  9from scipy import stats
 10from .bivariate_lines import sen, sma, bivariate_line_equation
 11# import xarray as xr
 12
 13__all__ = [
 14    "BivariateStatistics",
 15    "nmb",
 16    "nmae",
 17    "nmbf",
 18    "nmaef"
 19]
 20
 21def nmb( x0, x1 ):
 22    '''Compute Normalized Mean Bias (NMB)
 23
 24    NMB = ( mean(x1) - mean(x0) ) / mean(x0)
 25
 26    Parameters
 27    ----------
 28    x0 : array_like
 29        reference values
 30    x1 : array_like
 31        experiment values
 32    '''
 33
 34    assert (len(x0) == len(x1)), \
 35        "Parameters x0 and x1 must have the same length"
 36
 37    # Mean values
 38    x0_mean = np.mean(x0)
 39    x1_mean = np.mean(x1)
 40
 41    # Metric value
 42    return x1_mean / x0_mean - 1
 43
 44def nmae( x0, x1 ):
 45    '''Compute Normalized Mean Absolute Error (NMAE)
 46
 47    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
 48
 49    Parameters
 50    ---------
 51    x0 : array_like
 52        reference values
 53    x1 : array_like
 54        experiment values
 55    '''
 56
 57     # Mean values
 58    x0_mean = np.mean(x0)
 59
 60    # Mean absolute difference
 61    abs_diff = np.mean( np.abs(x1 - x0) )
 62
 63    # Metric value
 64    return abs_diff / np.abs( x0_mean )
 65
 66
 67def nmbf( x0, x1 ):
 68    '''Compute Normalized Mean Bias Factor (NMBF)
 69
 70    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
 71
 72    Parameters
 73    ----------
 74    x0 : array_like
 75        reference values
 76    x1 : array_like
 77        experiment values
 78    '''
 79
 80    # Ensure that arguments have the same length
 81    assert (len(x0) == len(x1)), \
 82        "Parameters x0 and x1 must have the same length"
 83
 84    # Mean values
 85    x0_mean = np.mean(x0)
 86    x1_mean = np.mean(x1)
 87
 88    # Metric value
 89    if x1_mean >= x0_mean:
 90        result = x1_mean / x0_mean - 1
 91    else:
 92        result= 1 - x0_mean / x1_mean
 93    # Equivalent (faster?) implementation
 94    #S = (mMean - oMean) / np.abs(mMean - oMean)
 95    #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 )
 96
 97    return result
 98
 99def nmaef( x0, x1 ):
100    '''Compute Normalized Mean Absolute Error Factor (NMAEF)
101
102    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
103    
104    Parameters
105    ----------
106    x0 : array_like
107        reference values
108    x1 : array_like
109        experiment values
110    '''
111
112    # Ensure that arguments have the same length
113    assert (len(x0) == len(x1)), \
114        "Parameters x0 and x1 must have the same length"
115
116    # Mean values
117    x0_mean = np.mean(x0)
118    x1_mean = np.mean(x1)
119
120    # Mean absolute difference
121    abs_diff = np.mean( np.abs(x1 - x0))
122
123    # Metric value
124    if x1_mean >= x0_mean:
125        result = abs_diff / x0_mean 
126    else:
127        result = abs_diff / x1_mean
128    # Equivalent (faster?) implementation
129    #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean)
130    #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) )
131
132    return result
133
134def _texify_name(name):
135    '''Return a LaTex formatted string for some variables
136    
137    Parameters
138    ----------
139    name : str
140    
141    Returns
142    -------
143    pretty_name : str
144    '''
145    if name.lower()=='n':
146        pretty_name = r'$n$'
147    elif name=='R2':
148        pretty_name = f'$R^2$'
149    elif name=='r2':
150        pretty_name = f'$r^2$'
151    elif name.lower()=='y_ols':
152        pretty_name = r'$y_{\rm OLS}$'
153    elif name.lower()=='y_sma':
154        pretty_name = r'$y_{\rm SMA}$'
155    elif name.lower()=='y_sen':
156        pretty_name = r'$y_{\rm Sen}$'
157    else:
158        pretty_name = name
159    return pretty_name
160
161def _number2str(value,
162                intformat='{:d}',
163                floatformat='{:.4f}'):
164    '''Format number as string using integer and float format specifiers
165    
166    Parameters
167    ----------
168    value : numeric, str
169        value to be converted
170    intformat : str, default='{:d}'
171        format specifier for integer types
172    floatformat : str, default='{:.4f}'
173        format specifier for float types
174
175    Returns
176    -------
177    str
178    '''
179    if isinstance(value,str):
180        pass
181    elif isinstance(value,(int,np.integer)):
182        value = intformat.format(value)
183    else:
184        value = floatformat.format(value)
185    return value
186
187class BivariateStatistics:
188    '''A suite of common statistics to quantify bivariate relationships
189
190    Class method 'summary' provides a formatted summary of these statistics
191    
192    Attributes
193    ----------
194    count, n : int
195        number of valid (not NaN) data value pairs
196    xmean, ymean : float
197        mean of x and y variables
198    xmedian, ymedian :float
199        median of x and y variables
200    xstd, ystd : float
201        standard deviation of x and y variables
202    mean_difference, md : float
203        ymean - xmean
204    mean_absolute_difference, mad : float
205        mean( |y-x| )
206    relative_mean_difference, rmd : float
207        md / xmean
208    relative_mean_absolute_difference, rmad :float
209        mad / xmean
210    standardized_mean_difference, smd : float
211        md / xstd
212    standardized_mean_absolute_difference, smad : float
213        mad /xstd
214    mean_relative_difference, mrd : float
215        mean(y/x) - 1
216    mean_log10_ratio, mlr : float
217        mean( log10(y/x) )
218    mean_absolute_log10_ratio, malr : float
219        mean( abs( log10(y/x) ) )
220    median_difference, medd : float
221        median(y-x)
222    median_absolute_difference, medad : float
223        median(|y-x|)
224    relative_median_difference, rmedd : float
225        median(y-x) / xmedian
226    relative_median_absolute_difference, rmedad : float
227        median(|y-x|) / xmedian
228    median_relative_difference, medianrd, medrd : float
229        median(y/x)-1
230    median_log10_ratio, medlr : float
231        median( log10(y/x) )
232    median_absolute_log10_ratio, medalr : float
233        median( abs( log10(y/x) ) )
234    normalized_mean_bias_factor, nmbf : float
235        see `nmbf` 
236    normalized_mean_absolute_error_factor, nmaef : float
237        see `nmaef`
238    root_mean_square_difference, rmsd : float
239        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
240    covariance : float
241        cov(x,y)
242    correlation_pearson, correlation, pearsonr, R, r : float
243        Pearson linear correlation coefficient 
244    correlation_spearman, spearmanr : float
245        Spearman, non-parametric rank correlation coefficient
246    R2, r2 : float
247        Linear coefficient of determination, $R^2$
248    '''
249
250    def __init__(self,x,y,w=None,dropna=False,data=None):
251        '''Compute suite of bivariate statistics during initialization
252        
253        Statistic values are saved in attributes.
254        CAUTION: Weights w are ignored except in SMA fit
255
256        Parameters
257        ----------
258        x : ndarray or str
259            independent variable values
260        y : ndarray or str
261            dependent variable values, same size as x
262        w : ndarray or str, optional
263            weights for points (x,y), same size as x and y
264        dropna : bool, optional (default=False)
265            drops NaN values from x, y, and w
266        data : dict-like, optional
267            if x, y, or w are str, then they should be keys in data
268        '''
269
270        # Get values from data if needed
271        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
272            raise ValueError( 'Data argument must be used if x, y, or w is a string')
273        if isinstance(x,str):
274            x = data[x]
275        if isinstance(y,str):
276            y = data[y]
277        if isinstance(w,str):
278            w = data[w]
279
280        #Ensure that x and y have same length
281        if len(x) != len(y):
282            raise ValueError( 'Arguments x and y must have the same length' )
283        if w is None:
284            w = np.ones_like(x)
285        if len(w) != len(x):
286            raise ValueError( 'Argument w (if present) must have the same length as x' )
287
288        # Drop NaN values
289        if dropna:
290            isna = np.isnan(x*y*w)
291            x = x[~isna]
292            y = y[~isna]
293            w = w[~isna]
294
295        # Differences and ratios used repeatedly
296        diff = y - x
297        absdiff = np.abs( y - x )
298        # Ignore divide by zero and 0/0 while dividing
299        old_settings = np.seterr(divide='ignore',invalid='ignore')
300        ratio = y/x
301        log10ratio = np.log10(ratio)
302        np.seterr(**old_settings)
303
304        # Number of data points
305        self.count = self.n = len(x)
306
307        # Means, medians, and standard deviations
308        self.xmean = np.mean(x)
309        self.ymean = np.mean(y)
310        self.xmedian = np.median(x)
311        self.ymedian = np.median(y)
312        self.xstd   = np.std(x)
313        self.ystd   = np.std(y)
314
315        # Save values for use later
316        self._x = x
317        self._y = y
318        self._w = w
319
320        # Mean and mean absolute differences
321        self.mean_difference            = self.md   = self.ymean - self.xmean
322        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
323
324        # Relative and standardized differences
325        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
326        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
327        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
328        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
329
330        # Mean and median relative differences
331        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
332        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
333
334        # Median and median absolute differences
335        self.median_difference          = self.medd  = np.median( diff )
336        self.median_absolute_difference = self.medad = np.median( absdiff )
337
338        # Relative median differences
339        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
340        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
341
342        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
343        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
344
345        # Mean and mean absolute log ratio
346        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
347        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
348        
349        # Median and median absolute log ratio
350        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
351        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
352        
353        # RMS difference
354        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
355
356        # Covariance, correlation
357        self.covariance = np.cov(x,y)[0][1]
358        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
359            np.corrcoef(x,y)[0][1]
360        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
361        self.R2 = self.r2 = self.R**2
362
363    def __getitem__(self,key):
364        '''Accesses attribute values via object['key']'''
365        return getattr(self,key)
366
367    def fitline(self,method='sma',intercept=True,**kwargs):
368        '''Compute bivariate line fit
369        
370        Parameters
371        ----------
372        method : str
373            line fitting method: sma (default), ols, wls, York, sen, siegel
374        intercept : bool
375            defines whether non-zero intercept should be fitted
376        **kwargs 
377            passed to `acgc.stats.sma` (e.g. robust=True)
378
379        Returns
380        -------
381        result : dict
382            dictionary with keys:
383            - slope (float)
384                slope of fitted line
385            - intercept (float)
386                intercept of fitted line
387            - fittedvalues (array (N,))
388                values on fit line
389            - residuals (array (N,))
390                residual from fit line
391        '''
392
393        fitintercept = intercept
394
395        if method.lower()=='sma':
396            fit = sma(  self._x,
397                        self._y,
398                        self._w,
399                        intercept=fitintercept,
400                        **kwargs)
401            slope = fit['slope']
402            intercept= fit['intercept']
403
404        elif method.lower()=='ols':
405            if fitintercept:
406                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
407                                      self._y, rcond=None )
408            else:
409                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
410            slope = ols[0][0]
411            intercept = ols[0][1]
412
413        elif method.lower() in ['theil','sen','theilsen']:
414            fitintercept = True
415            fit = sen( self._x,
416                       self._y,
417                       **kwargs)
418            slope = fit.slope
419            intercept = fit.intercept
420
421        elif method.lower()=='siegel':
422            fitintercept = True
423            siegel = stats.siegelslopes( self._x,
424                                         self._y )
425            slope = siegel.slope
426            intercept = siegel.intercept
427
428        elif method.lower()=='wls':
429            raise NotImplementedError('WLS regression not implemented yet')
430
431        elif method.lower()=='york':
432            raise NotImplementedError('York regression not implemented yet')
433
434        else:
435            raise ValueError('Undefined method '+method)
436
437        line = dict( slope          = slope,
438                     intercept      = intercept,
439                     fittedvalues   = slope * self._x + intercept,
440                     residuals      = self._y - ( slope * self._x + intercept ),
441                     method         = method,
442                     fitintercept   = fitintercept )
443
444        return line
445
446    def slope(self,method='sma',intercept=True,**kwargs):
447        '''Compute slope of bivariate line fit
448        
449        Parameters
450        ----------
451        method : str
452            line fitting method: sma (default), ols, wls
453        intercept : bool
454            defines whether non-zero intercept should be fitted
455        **kwargs 
456            passed to `fitline`
457
458        Returns
459        -------
460        slope : float
461            value of y intercept
462        '''
463        return self.fitline(method,intercept,**kwargs)['slope']
464
465    def intercept(self,method='sma',intercept=True,**kwargs):
466        '''Compute intercept of bivariate line fit
467        
468        Parameters
469        ----------
470        method : str
471            line fitting method: sma (default) or ols
472        intercept : bool
473            defines whether non-zero intercept should be fitted
474        **kwargs 
475            passed to `fitline`
476
477        Returns
478        -------
479        intercept : float
480            value of y intercept
481        '''
482        return self.fitline(method,intercept,**kwargs)['intercept']
483
484    def _expand_variables(self,variables):
485        '''Expand special strings into a list of variables
486        
487        Parameter
488        ---------
489        variables : list or str, default='common'
490            Special strings ("all","common") will be expanded to a list of variables
491            list arguments will not be modified
492
493        Returns
494        -------
495        list 
496            variable names
497        '''
498        if variables is None:
499            variables='common'
500        if variables=='all':
501            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
502                       'MLR','MALR',
503                       'MedD','MedAD','RMedD','RMedAD','MedRD',
504                       'MedLR','MedALR',
505                       'NMBF','NMAEF','RMSD','cov',
506                       'R','R2','spearmanr','slope','intercept',
507                       'fitline','n']
508        elif variables=='common':
509            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n']
510        if not isinstance(variables,list):
511            raise ValueError(
512                'variables must be a list, None, or one of these strings: "all","common"')
513
514        return variables
515
516    def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
517        '''Summarize bivariate statistics into a dict
518
519        Parameters
520        ----------
521        vars : list or str, default='common'
522            names of attribute variables to include in summary
523            names are case insensitive            
524            The following strings are also accepted in place of a list 
525                "all" (displays all variables)
526                "common" (displays all measures of mean difference)
527        fitline_kw : dict, default=None
528            keywords passed to `fitline`
529        floatformat_fiteqn : str, default=floatformat
530            format specifier for slope and intercept (a,b) in y = a x + b
531        
532        Returns
533        -------
534        summary : dict
535            names and values of variables
536        '''
537
538        # List of variables
539        variables = self._expand_variables(variables)
540
541        if fitline_kw is None:
542            fitline_kw = {'method':'sma',
543                          'intercept':True}
544
545        # Construct the dict
546        summary = {}
547        for v in variables:
548            if v in ['slope','intercept']:
549                # These variables are object methods
550                func = getattr(self,v)
551                value = func(**fitline_kw)
552            elif v == 'fitline':
553                line = self.fitline(**fitline_kw)
554                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
555            else:
556                # Retrieve values
557                value = getattr(self,v.lower())
558
559            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
560            summary[v] = value
561
562        return summary
563
564    def summary(self, variables=None, fitline_kw=None,
565                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
566                stringlength=None ):
567        '''Summarize bivariate statistics
568
569        Parameters
570        ----------
571        vars : list or str, default='common'
572            names of attribute variables to include in summary
573            names are case insensitive            
574            The following strings are also accepted in place of a list 
575                "all" (displays all variables)
576                "common" (displays all measures of mean difference)
577        fitline_kw : dict, default=None
578            keywords passed to `fitline`
579        intformat : str, default='{:d}'
580            format specifier for integer values
581        floatformat : str, default='{:.4f}'
582            format specifier for floating point values
583        floatformat_fiteqn : str, default=floatformat
584            format specifier for slope and intercept (a,b) in y = a x + b
585        stringlength : int, default=None
586            length of the variables on output
587            default (None) is to use the length of the longest variable name
588        
589        Returns
590        -------
591        summary : str
592            names and values of variables
593        '''
594        # List of variables
595        variables = self._expand_variables(variables)
596
597        if floatformat_fiteqn is None:
598            floatformat_fiteqn = floatformat
599        if stringlength is None:
600            stringlength = np.max([len(v) for v in variables])
601        stringformat = '{:'+str(stringlength)+'s}'
602
603        # Get a dict containing the needed variables
604        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
605
606        # Extract length of the float numbers from floatformat
607        # import re
608        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
609        #       floatformat )[0] ) ).astype(int)
610
611        # summary = (stringformat+'{:>10s}').format('Variable','Value')
612        summarytext = ''
613        for k,v in summarydict.items():
614            vstr = _number2str(v,intformat,floatformat)
615            summarytext += (stringformat+' = {:s}\n').format(k,vstr)
616
617        return summarytext
618
619    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
620                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
621                          loc=None, loc_units='axes',
622                          **kwargs):
623        '''Display bivariate statistics as a table inset on a plot axis
624
625        Parameters
626        ----------
627        ax : matplotlib.Figure.Axis 
628            axis where the table will be displayed
629        variables : list or str, default='common'
630            names of attribute variables to include in summary
631            names are case insensitive            
632            The following strings are also accepted in place of a list 
633                "all" (displays all variables)
634                "common" (displays all measures of mean difference)
635        fitline_kw : dict, default=None
636            keywords passed to `fitline`
637        intformat : str, default='{:d}'
638            format specifier for integer values
639        floatformat : str, default='{:.3f}'
640            format specifier for floating point values
641        floatformat_fiteqn : str, default=floatformat
642            format specifier for slope and intercept (a,b) in y = a x + b
643        loc : tuple (x0,y0), default=(0.85, 0.05)
644            location on the axis where the table will be drawn
645            can be in data units or axes units [0-1]
646        loc_units : {'axes' (default), 'data'}
647            specifies whether loc has 'data' units or 'axes' units [0-1]
648                    
649        Returns
650        -------
651        text1, text2 : matplotlib text object
652            Artist for the two text boxes        
653        '''
654        # List of variables
655        variables = self._expand_variables(variables)
656
657        if floatformat_fiteqn is None:
658            floatformat_fiteqn = floatformat
659
660        # Default location in lower right corner
661        if loc is None:
662            loc = (0.8,0.05)
663
664        # Coordinates for loc
665        if loc_units.lower()=='data':
666            coord=ax.transData
667        elif loc_units.lower() in ['axes','axis']:
668            coord=ax.transAxes
669        else:
670            raise ValueError('Display units should be "Data" or "Axes"')
671
672        # Get a dict containing the needed variables
673        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
674
675        # Column of label text
676        label_text = '\n'.join([_texify_name(key)
677                                for key in summarydict])
678        # Column of value text
679        value_text = '\n'.join([_number2str(v,intformat,floatformat)
680                                for v in summarydict.values()])
681
682        # Check if horizontal alignment keyword is used
683        ha=''
684        try:
685            ha = kwargs['ha']
686        except KeyError:
687            pass
688        try:
689            ha = kwargs['horizontalalignment']
690        except KeyError:
691            pass
692
693        # For right alignment, align on values first
694        # Otherwise, align on labels
695        if ha=='right':
696            first_text = value_text
697            second_text = label_text
698            sign = -1
699        else:
700            first_text = label_text
701            second_text = value_text
702            sign = +1
703
704        # Add first column of text
705        t1=ax.text(loc[0],loc[1],
706                first_text,
707                transform=coord,
708                **kwargs
709                )
710
711        # Get width of first text column
712        bbox = t1.get_window_extent().transformed(coord.inverted())
713        width = bbox.x1-bbox.x0
714
715        # Add second column of text
716        t2 = ax.text(loc[0]+width*sign,loc[1],
717                     second_text,
718                     transform=coord,
719                     **kwargs
720                     )
721
722        ##################################
723        # Early version of this function using matplotlib.table.table()
724
725        # if isinstance(loc,(tuple,list)):
726        #     # Create an inset axis to contain the table
727        #     tableaxis = ax.inset_axes(loc)
728        #     table_width=1
729        # else:
730        #     tableaxis = ax
731
732        # # Display the table on the axis
733        # return mtable.table(
734        #     tableaxis,
735        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
736        #     rowLabels=[texify_name(key) for key in summarydict],
737        #     colWidths=[table_width/2]*2,
738        #     edges=edges,
739        #     loc=loc, bbox=bbox
740        #     )
741
742        return [t1,t2]
class BivariateStatistics:
188class BivariateStatistics:
189    '''A suite of common statistics to quantify bivariate relationships
190
191    Class method 'summary' provides a formatted summary of these statistics
192    
193    Attributes
194    ----------
195    count, n : int
196        number of valid (not NaN) data value pairs
197    xmean, ymean : float
198        mean of x and y variables
199    xmedian, ymedian :float
200        median of x and y variables
201    xstd, ystd : float
202        standard deviation of x and y variables
203    mean_difference, md : float
204        ymean - xmean
205    mean_absolute_difference, mad : float
206        mean( |y-x| )
207    relative_mean_difference, rmd : float
208        md / xmean
209    relative_mean_absolute_difference, rmad :float
210        mad / xmean
211    standardized_mean_difference, smd : float
212        md / xstd
213    standardized_mean_absolute_difference, smad : float
214        mad /xstd
215    mean_relative_difference, mrd : float
216        mean(y/x) - 1
217    mean_log10_ratio, mlr : float
218        mean( log10(y/x) )
219    mean_absolute_log10_ratio, malr : float
220        mean( abs( log10(y/x) ) )
221    median_difference, medd : float
222        median(y-x)
223    median_absolute_difference, medad : float
224        median(|y-x|)
225    relative_median_difference, rmedd : float
226        median(y-x) / xmedian
227    relative_median_absolute_difference, rmedad : float
228        median(|y-x|) / xmedian
229    median_relative_difference, medianrd, medrd : float
230        median(y/x)-1
231    median_log10_ratio, medlr : float
232        median( log10(y/x) )
233    median_absolute_log10_ratio, medalr : float
234        median( abs( log10(y/x) ) )
235    normalized_mean_bias_factor, nmbf : float
236        see `nmbf` 
237    normalized_mean_absolute_error_factor, nmaef : float
238        see `nmaef`
239    root_mean_square_difference, rmsd : float
240        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
241    covariance : float
242        cov(x,y)
243    correlation_pearson, correlation, pearsonr, R, r : float
244        Pearson linear correlation coefficient 
245    correlation_spearman, spearmanr : float
246        Spearman, non-parametric rank correlation coefficient
247    R2, r2 : float
248        Linear coefficient of determination, $R^2$
249    '''
250
251    def __init__(self,x,y,w=None,dropna=False,data=None):
252        '''Compute suite of bivariate statistics during initialization
253        
254        Statistic values are saved in attributes.
255        CAUTION: Weights w are ignored except in SMA fit
256
257        Parameters
258        ----------
259        x : ndarray or str
260            independent variable values
261        y : ndarray or str
262            dependent variable values, same size as x
263        w : ndarray or str, optional
264            weights for points (x,y), same size as x and y
265        dropna : bool, optional (default=False)
266            drops NaN values from x, y, and w
267        data : dict-like, optional
268            if x, y, or w are str, then they should be keys in data
269        '''
270
271        # Get values from data if needed
272        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
273            raise ValueError( 'Data argument must be used if x, y, or w is a string')
274        if isinstance(x,str):
275            x = data[x]
276        if isinstance(y,str):
277            y = data[y]
278        if isinstance(w,str):
279            w = data[w]
280
281        #Ensure that x and y have same length
282        if len(x) != len(y):
283            raise ValueError( 'Arguments x and y must have the same length' )
284        if w is None:
285            w = np.ones_like(x)
286        if len(w) != len(x):
287            raise ValueError( 'Argument w (if present) must have the same length as x' )
288
289        # Drop NaN values
290        if dropna:
291            isna = np.isnan(x*y*w)
292            x = x[~isna]
293            y = y[~isna]
294            w = w[~isna]
295
296        # Differences and ratios used repeatedly
297        diff = y - x
298        absdiff = np.abs( y - x )
299        # Ignore divide by zero and 0/0 while dividing
300        old_settings = np.seterr(divide='ignore',invalid='ignore')
301        ratio = y/x
302        log10ratio = np.log10(ratio)
303        np.seterr(**old_settings)
304
305        # Number of data points
306        self.count = self.n = len(x)
307
308        # Means, medians, and standard deviations
309        self.xmean = np.mean(x)
310        self.ymean = np.mean(y)
311        self.xmedian = np.median(x)
312        self.ymedian = np.median(y)
313        self.xstd   = np.std(x)
314        self.ystd   = np.std(y)
315
316        # Save values for use later
317        self._x = x
318        self._y = y
319        self._w = w
320
321        # Mean and mean absolute differences
322        self.mean_difference            = self.md   = self.ymean - self.xmean
323        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
324
325        # Relative and standardized differences
326        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
327        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
328        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
329        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
330
331        # Mean and median relative differences
332        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
333        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
334
335        # Median and median absolute differences
336        self.median_difference          = self.medd  = np.median( diff )
337        self.median_absolute_difference = self.medad = np.median( absdiff )
338
339        # Relative median differences
340        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
341        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
342
343        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
344        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
345
346        # Mean and mean absolute log ratio
347        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
348        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
349        
350        # Median and median absolute log ratio
351        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
352        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
353        
354        # RMS difference
355        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
356
357        # Covariance, correlation
358        self.covariance = np.cov(x,y)[0][1]
359        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
360            np.corrcoef(x,y)[0][1]
361        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
362        self.R2 = self.r2 = self.R**2
363
364    def __getitem__(self,key):
365        '''Accesses attribute values via object['key']'''
366        return getattr(self,key)
367
368    def fitline(self,method='sma',intercept=True,**kwargs):
369        '''Compute bivariate line fit
370        
371        Parameters
372        ----------
373        method : str
374            line fitting method: sma (default), ols, wls, York, sen, siegel
375        intercept : bool
376            defines whether non-zero intercept should be fitted
377        **kwargs 
378            passed to `acgc.stats.sma` (e.g. robust=True)
379
380        Returns
381        -------
382        result : dict
383            dictionary with keys:
384            - slope (float)
385                slope of fitted line
386            - intercept (float)
387                intercept of fitted line
388            - fittedvalues (array (N,))
389                values on fit line
390            - residuals (array (N,))
391                residual from fit line
392        '''
393
394        fitintercept = intercept
395
396        if method.lower()=='sma':
397            fit = sma(  self._x,
398                        self._y,
399                        self._w,
400                        intercept=fitintercept,
401                        **kwargs)
402            slope = fit['slope']
403            intercept= fit['intercept']
404
405        elif method.lower()=='ols':
406            if fitintercept:
407                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
408                                      self._y, rcond=None )
409            else:
410                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
411            slope = ols[0][0]
412            intercept = ols[0][1]
413
414        elif method.lower() in ['theil','sen','theilsen']:
415            fitintercept = True
416            fit = sen( self._x,
417                       self._y,
418                       **kwargs)
419            slope = fit.slope
420            intercept = fit.intercept
421
422        elif method.lower()=='siegel':
423            fitintercept = True
424            siegel = stats.siegelslopes( self._x,
425                                         self._y )
426            slope = siegel.slope
427            intercept = siegel.intercept
428
429        elif method.lower()=='wls':
430            raise NotImplementedError('WLS regression not implemented yet')
431
432        elif method.lower()=='york':
433            raise NotImplementedError('York regression not implemented yet')
434
435        else:
436            raise ValueError('Undefined method '+method)
437
438        line = dict( slope          = slope,
439                     intercept      = intercept,
440                     fittedvalues   = slope * self._x + intercept,
441                     residuals      = self._y - ( slope * self._x + intercept ),
442                     method         = method,
443                     fitintercept   = fitintercept )
444
445        return line
446
447    def slope(self,method='sma',intercept=True,**kwargs):
448        '''Compute slope of bivariate line fit
449        
450        Parameters
451        ----------
452        method : str
453            line fitting method: sma (default), ols, wls
454        intercept : bool
455            defines whether non-zero intercept should be fitted
456        **kwargs 
457            passed to `fitline`
458
459        Returns
460        -------
461        slope : float
462            value of y intercept
463        '''
464        return self.fitline(method,intercept,**kwargs)['slope']
465
466    def intercept(self,method='sma',intercept=True,**kwargs):
467        '''Compute intercept of bivariate line fit
468        
469        Parameters
470        ----------
471        method : str
472            line fitting method: sma (default) or ols
473        intercept : bool
474            defines whether non-zero intercept should be fitted
475        **kwargs 
476            passed to `fitline`
477
478        Returns
479        -------
480        intercept : float
481            value of y intercept
482        '''
483        return self.fitline(method,intercept,**kwargs)['intercept']
484
485    def _expand_variables(self,variables):
486        '''Expand special strings into a list of variables
487        
488        Parameter
489        ---------
490        variables : list or str, default='common'
491            Special strings ("all","common") will be expanded to a list of variables
492            list arguments will not be modified
493
494        Returns
495        -------
496        list 
497            variable names
498        '''
499        if variables is None:
500            variables='common'
501        if variables=='all':
502            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
503                       'MLR','MALR',
504                       'MedD','MedAD','RMedD','RMedAD','MedRD',
505                       'MedLR','MedALR',
506                       'NMBF','NMAEF','RMSD','cov',
507                       'R','R2','spearmanr','slope','intercept',
508                       'fitline','n']
509        elif variables=='common':
510            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n']
511        if not isinstance(variables,list):
512            raise ValueError(
513                'variables must be a list, None, or one of these strings: "all","common"')
514
515        return variables
516
517    def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
518        '''Summarize bivariate statistics into a dict
519
520        Parameters
521        ----------
522        vars : list or str, default='common'
523            names of attribute variables to include in summary
524            names are case insensitive            
525            The following strings are also accepted in place of a list 
526                "all" (displays all variables)
527                "common" (displays all measures of mean difference)
528        fitline_kw : dict, default=None
529            keywords passed to `fitline`
530        floatformat_fiteqn : str, default=floatformat
531            format specifier for slope and intercept (a,b) in y = a x + b
532        
533        Returns
534        -------
535        summary : dict
536            names and values of variables
537        '''
538
539        # List of variables
540        variables = self._expand_variables(variables)
541
542        if fitline_kw is None:
543            fitline_kw = {'method':'sma',
544                          'intercept':True}
545
546        # Construct the dict
547        summary = {}
548        for v in variables:
549            if v in ['slope','intercept']:
550                # These variables are object methods
551                func = getattr(self,v)
552                value = func(**fitline_kw)
553            elif v == 'fitline':
554                line = self.fitline(**fitline_kw)
555                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
556            else:
557                # Retrieve values
558                value = getattr(self,v.lower())
559
560            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
561            summary[v] = value
562
563        return summary
564
565    def summary(self, variables=None, fitline_kw=None,
566                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
567                stringlength=None ):
568        '''Summarize bivariate statistics
569
570        Parameters
571        ----------
572        vars : list or str, default='common'
573            names of attribute variables to include in summary
574            names are case insensitive            
575            The following strings are also accepted in place of a list 
576                "all" (displays all variables)
577                "common" (displays all measures of mean difference)
578        fitline_kw : dict, default=None
579            keywords passed to `fitline`
580        intformat : str, default='{:d}'
581            format specifier for integer values
582        floatformat : str, default='{:.4f}'
583            format specifier for floating point values
584        floatformat_fiteqn : str, default=floatformat
585            format specifier for slope and intercept (a,b) in y = a x + b
586        stringlength : int, default=None
587            length of the variables on output
588            default (None) is to use the length of the longest variable name
589        
590        Returns
591        -------
592        summary : str
593            names and values of variables
594        '''
595        # List of variables
596        variables = self._expand_variables(variables)
597
598        if floatformat_fiteqn is None:
599            floatformat_fiteqn = floatformat
600        if stringlength is None:
601            stringlength = np.max([len(v) for v in variables])
602        stringformat = '{:'+str(stringlength)+'s}'
603
604        # Get a dict containing the needed variables
605        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
606
607        # Extract length of the float numbers from floatformat
608        # import re
609        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
610        #       floatformat )[0] ) ).astype(int)
611
612        # summary = (stringformat+'{:>10s}').format('Variable','Value')
613        summarytext = ''
614        for k,v in summarydict.items():
615            vstr = _number2str(v,intformat,floatformat)
616            summarytext += (stringformat+' = {:s}\n').format(k,vstr)
617
618        return summarytext
619
620    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
621                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
622                          loc=None, loc_units='axes',
623                          **kwargs):
624        '''Display bivariate statistics as a table inset on a plot axis
625
626        Parameters
627        ----------
628        ax : matplotlib.Figure.Axis 
629            axis where the table will be displayed
630        variables : list or str, default='common'
631            names of attribute variables to include in summary
632            names are case insensitive            
633            The following strings are also accepted in place of a list 
634                "all" (displays all variables)
635                "common" (displays all measures of mean difference)
636        fitline_kw : dict, default=None
637            keywords passed to `fitline`
638        intformat : str, default='{:d}'
639            format specifier for integer values
640        floatformat : str, default='{:.3f}'
641            format specifier for floating point values
642        floatformat_fiteqn : str, default=floatformat
643            format specifier for slope and intercept (a,b) in y = a x + b
644        loc : tuple (x0,y0), default=(0.85, 0.05)
645            location on the axis where the table will be drawn
646            can be in data units or axes units [0-1]
647        loc_units : {'axes' (default), 'data'}
648            specifies whether loc has 'data' units or 'axes' units [0-1]
649                    
650        Returns
651        -------
652        text1, text2 : matplotlib text object
653            Artist for the two text boxes        
654        '''
655        # List of variables
656        variables = self._expand_variables(variables)
657
658        if floatformat_fiteqn is None:
659            floatformat_fiteqn = floatformat
660
661        # Default location in lower right corner
662        if loc is None:
663            loc = (0.8,0.05)
664
665        # Coordinates for loc
666        if loc_units.lower()=='data':
667            coord=ax.transData
668        elif loc_units.lower() in ['axes','axis']:
669            coord=ax.transAxes
670        else:
671            raise ValueError('Display units should be "Data" or "Axes"')
672
673        # Get a dict containing the needed variables
674        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
675
676        # Column of label text
677        label_text = '\n'.join([_texify_name(key)
678                                for key in summarydict])
679        # Column of value text
680        value_text = '\n'.join([_number2str(v,intformat,floatformat)
681                                for v in summarydict.values()])
682
683        # Check if horizontal alignment keyword is used
684        ha=''
685        try:
686            ha = kwargs['ha']
687        except KeyError:
688            pass
689        try:
690            ha = kwargs['horizontalalignment']
691        except KeyError:
692            pass
693
694        # For right alignment, align on values first
695        # Otherwise, align on labels
696        if ha=='right':
697            first_text = value_text
698            second_text = label_text
699            sign = -1
700        else:
701            first_text = label_text
702            second_text = value_text
703            sign = +1
704
705        # Add first column of text
706        t1=ax.text(loc[0],loc[1],
707                first_text,
708                transform=coord,
709                **kwargs
710                )
711
712        # Get width of first text column
713        bbox = t1.get_window_extent().transformed(coord.inverted())
714        width = bbox.x1-bbox.x0
715
716        # Add second column of text
717        t2 = ax.text(loc[0]+width*sign,loc[1],
718                     second_text,
719                     transform=coord,
720                     **kwargs
721                     )
722
723        ##################################
724        # Early version of this function using matplotlib.table.table()
725
726        # if isinstance(loc,(tuple,list)):
727        #     # Create an inset axis to contain the table
728        #     tableaxis = ax.inset_axes(loc)
729        #     table_width=1
730        # else:
731        #     tableaxis = ax
732
733        # # Display the table on the axis
734        # return mtable.table(
735        #     tableaxis,
736        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
737        #     rowLabels=[texify_name(key) for key in summarydict],
738        #     colWidths=[table_width/2]*2,
739        #     edges=edges,
740        #     loc=loc, bbox=bbox
741        #     )
742
743        return [t1,t2]

A suite of common statistics to quantify bivariate relationships

Class method 'summary' provides a formatted summary of these statistics

Attributes
  • count, n (int): number of valid (not NaN) data value pairs
  • xmean, ymean (float): mean of x and y variables
  • xmedian, ymedian (float): median of x and y variables
  • xstd, ystd (float): standard deviation of x and y variables
  • mean_difference, md (float): ymean - xmean
  • mean_absolute_difference, mad (float): mean( |y-x| )
  • relative_mean_difference, rmd (float): md / xmean
  • relative_mean_absolute_difference, rmad (float): mad / xmean
  • standardized_mean_difference, smd (float): md / xstd
  • standardized_mean_absolute_difference, smad (float): mad /xstd
  • mean_relative_difference, mrd (float): mean(y/x) - 1
  • mean_log10_ratio, mlr (float): mean( log10(y/x) )
  • mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
  • median_difference, medd (float): median(y-x)
  • median_absolute_difference, medad (float): median(|y-x|)
  • relative_median_difference, rmedd (float): median(y-x) / xmedian
  • relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
  • median_relative_difference, medianrd, medrd (float): median(y/x)-1
  • median_log10_ratio, medlr (float): median( log10(y/x) )
  • median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
  • normalized_mean_bias_factor, nmbf (float): see nmbf
  • normalized_mean_absolute_error_factor, nmaef (float): see nmaef
  • root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
  • covariance (float): cov(x,y)
  • correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
  • correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
  • R2, r2 (float): Linear coefficient of determination, $R^2$
BivariateStatistics(x, y, w=None, dropna=False, data=None)
251    def __init__(self,x,y,w=None,dropna=False,data=None):
252        '''Compute suite of bivariate statistics during initialization
253        
254        Statistic values are saved in attributes.
255        CAUTION: Weights w are ignored except in SMA fit
256
257        Parameters
258        ----------
259        x : ndarray or str
260            independent variable values
261        y : ndarray or str
262            dependent variable values, same size as x
263        w : ndarray or str, optional
264            weights for points (x,y), same size as x and y
265        dropna : bool, optional (default=False)
266            drops NaN values from x, y, and w
267        data : dict-like, optional
268            if x, y, or w are str, then they should be keys in data
269        '''
270
271        # Get values from data if needed
272        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
273            raise ValueError( 'Data argument must be used if x, y, or w is a string')
274        if isinstance(x,str):
275            x = data[x]
276        if isinstance(y,str):
277            y = data[y]
278        if isinstance(w,str):
279            w = data[w]
280
281        #Ensure that x and y have same length
282        if len(x) != len(y):
283            raise ValueError( 'Arguments x and y must have the same length' )
284        if w is None:
285            w = np.ones_like(x)
286        if len(w) != len(x):
287            raise ValueError( 'Argument w (if present) must have the same length as x' )
288
289        # Drop NaN values
290        if dropna:
291            isna = np.isnan(x*y*w)
292            x = x[~isna]
293            y = y[~isna]
294            w = w[~isna]
295
296        # Differences and ratios used repeatedly
297        diff = y - x
298        absdiff = np.abs( y - x )
299        # Ignore divide by zero and 0/0 while dividing
300        old_settings = np.seterr(divide='ignore',invalid='ignore')
301        ratio = y/x
302        log10ratio = np.log10(ratio)
303        np.seterr(**old_settings)
304
305        # Number of data points
306        self.count = self.n = len(x)
307
308        # Means, medians, and standard deviations
309        self.xmean = np.mean(x)
310        self.ymean = np.mean(y)
311        self.xmedian = np.median(x)
312        self.ymedian = np.median(y)
313        self.xstd   = np.std(x)
314        self.ystd   = np.std(y)
315
316        # Save values for use later
317        self._x = x
318        self._y = y
319        self._w = w
320
321        # Mean and mean absolute differences
322        self.mean_difference            = self.md   = self.ymean - self.xmean
323        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
324
325        # Relative and standardized differences
326        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
327        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
328        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
329        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
330
331        # Mean and median relative differences
332        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
333        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
334
335        # Median and median absolute differences
336        self.median_difference          = self.medd  = np.median( diff )
337        self.median_absolute_difference = self.medad = np.median( absdiff )
338
339        # Relative median differences
340        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
341        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
342
343        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
344        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
345
346        # Mean and mean absolute log ratio
347        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
348        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
349        
350        # Median and median absolute log ratio
351        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
352        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
353        
354        # RMS difference
355        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
356
357        # Covariance, correlation
358        self.covariance = np.cov(x,y)[0][1]
359        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
360            np.corrcoef(x,y)[0][1]
361        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
362        self.R2 = self.r2 = self.R**2

Compute suite of bivariate statistics during initialization

Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit

Parameters
  • x (ndarray or str): independent variable values
  • y (ndarray or str): dependent variable values, same size as x
  • w (ndarray or str, optional): weights for points (x,y), same size as x and y
  • dropna (bool, optional (default=False)): drops NaN values from x, y, and w
  • data (dict-like, optional): if x, y, or w are str, then they should be keys in data
xmean
ymean
xmedian
ymedian
xstd
ystd
covariance
def fitline(self, method='sma', intercept=True, **kwargs):
368    def fitline(self,method='sma',intercept=True,**kwargs):
369        '''Compute bivariate line fit
370        
371        Parameters
372        ----------
373        method : str
374            line fitting method: sma (default), ols, wls, York, sen, siegel
375        intercept : bool
376            defines whether non-zero intercept should be fitted
377        **kwargs 
378            passed to `acgc.stats.sma` (e.g. robust=True)
379
380        Returns
381        -------
382        result : dict
383            dictionary with keys:
384            - slope (float)
385                slope of fitted line
386            - intercept (float)
387                intercept of fitted line
388            - fittedvalues (array (N,))
389                values on fit line
390            - residuals (array (N,))
391                residual from fit line
392        '''
393
394        fitintercept = intercept
395
396        if method.lower()=='sma':
397            fit = sma(  self._x,
398                        self._y,
399                        self._w,
400                        intercept=fitintercept,
401                        **kwargs)
402            slope = fit['slope']
403            intercept= fit['intercept']
404
405        elif method.lower()=='ols':
406            if fitintercept:
407                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
408                                      self._y, rcond=None )
409            else:
410                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
411            slope = ols[0][0]
412            intercept = ols[0][1]
413
414        elif method.lower() in ['theil','sen','theilsen']:
415            fitintercept = True
416            fit = sen( self._x,
417                       self._y,
418                       **kwargs)
419            slope = fit.slope
420            intercept = fit.intercept
421
422        elif method.lower()=='siegel':
423            fitintercept = True
424            siegel = stats.siegelslopes( self._x,
425                                         self._y )
426            slope = siegel.slope
427            intercept = siegel.intercept
428
429        elif method.lower()=='wls':
430            raise NotImplementedError('WLS regression not implemented yet')
431
432        elif method.lower()=='york':
433            raise NotImplementedError('York regression not implemented yet')
434
435        else:
436            raise ValueError('Undefined method '+method)
437
438        line = dict( slope          = slope,
439                     intercept      = intercept,
440                     fittedvalues   = slope * self._x + intercept,
441                     residuals      = self._y - ( slope * self._x + intercept ),
442                     method         = method,
443                     fitintercept   = fitintercept )
444
445        return line

Compute bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to acgc.stats.sma (e.g. robust=True)
Returns
  • result (dict): dictionary with keys:
    • slope (float) slope of fitted line
    • intercept (float) intercept of fitted line
    • fittedvalues (array (N,)) values on fit line
    • residuals (array (N,)) residual from fit line
def slope(self, method='sma', intercept=True, **kwargs):
447    def slope(self,method='sma',intercept=True,**kwargs):
448        '''Compute slope of bivariate line fit
449        
450        Parameters
451        ----------
452        method : str
453            line fitting method: sma (default), ols, wls
454        intercept : bool
455            defines whether non-zero intercept should be fitted
456        **kwargs 
457            passed to `fitline`
458
459        Returns
460        -------
461        slope : float
462            value of y intercept
463        '''
464        return self.fitline(method,intercept,**kwargs)['slope']

Compute slope of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • slope (float): value of y intercept
def intercept(self, method='sma', intercept=True, **kwargs):
466    def intercept(self,method='sma',intercept=True,**kwargs):
467        '''Compute intercept of bivariate line fit
468        
469        Parameters
470        ----------
471        method : str
472            line fitting method: sma (default) or ols
473        intercept : bool
474            defines whether non-zero intercept should be fitted
475        **kwargs 
476            passed to `fitline`
477
478        Returns
479        -------
480        intercept : float
481            value of y intercept
482        '''
483        return self.fitline(method,intercept,**kwargs)['intercept']

Compute intercept of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default) or ols
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • intercept (float): value of y intercept
def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
517    def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
518        '''Summarize bivariate statistics into a dict
519
520        Parameters
521        ----------
522        vars : list or str, default='common'
523            names of attribute variables to include in summary
524            names are case insensitive            
525            The following strings are also accepted in place of a list 
526                "all" (displays all variables)
527                "common" (displays all measures of mean difference)
528        fitline_kw : dict, default=None
529            keywords passed to `fitline`
530        floatformat_fiteqn : str, default=floatformat
531            format specifier for slope and intercept (a,b) in y = a x + b
532        
533        Returns
534        -------
535        summary : dict
536            names and values of variables
537        '''
538
539        # List of variables
540        variables = self._expand_variables(variables)
541
542        if fitline_kw is None:
543            fitline_kw = {'method':'sma',
544                          'intercept':True}
545
546        # Construct the dict
547        summary = {}
548        for v in variables:
549            if v in ['slope','intercept']:
550                # These variables are object methods
551                func = getattr(self,v)
552                value = func(**fitline_kw)
553            elif v == 'fitline':
554                line = self.fitline(**fitline_kw)
555                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
556            else:
557                # Retrieve values
558                value = getattr(self,v.lower())
559
560            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
561            summary[v] = value
562
563        return summary

Summarize bivariate statistics into a dict

Parameters
  • vars (list or str, default='common'): names of attribute variables to include in summary names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
Returns
  • summary (dict): names and values of variables
def summary( self, variables=None, fitline_kw=None, intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, stringlength=None):
565    def summary(self, variables=None, fitline_kw=None,
566                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
567                stringlength=None ):
568        '''Summarize bivariate statistics
569
570        Parameters
571        ----------
572        vars : list or str, default='common'
573            names of attribute variables to include in summary
574            names are case insensitive            
575            The following strings are also accepted in place of a list 
576                "all" (displays all variables)
577                "common" (displays all measures of mean difference)
578        fitline_kw : dict, default=None
579            keywords passed to `fitline`
580        intformat : str, default='{:d}'
581            format specifier for integer values
582        floatformat : str, default='{:.4f}'
583            format specifier for floating point values
584        floatformat_fiteqn : str, default=floatformat
585            format specifier for slope and intercept (a,b) in y = a x + b
586        stringlength : int, default=None
587            length of the variables on output
588            default (None) is to use the length of the longest variable name
589        
590        Returns
591        -------
592        summary : str
593            names and values of variables
594        '''
595        # List of variables
596        variables = self._expand_variables(variables)
597
598        if floatformat_fiteqn is None:
599            floatformat_fiteqn = floatformat
600        if stringlength is None:
601            stringlength = np.max([len(v) for v in variables])
602        stringformat = '{:'+str(stringlength)+'s}'
603
604        # Get a dict containing the needed variables
605        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
606
607        # Extract length of the float numbers from floatformat
608        # import re
609        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
610        #       floatformat )[0] ) ).astype(int)
611
612        # summary = (stringformat+'{:>10s}').format('Variable','Value')
613        summarytext = ''
614        for k,v in summarydict.items():
615            vstr = _number2str(v,intformat,floatformat)
616            summarytext += (stringformat+' = {:s}\n').format(k,vstr)
617
618        return summarytext

Summarize bivariate statistics

Parameters
  • vars (list or str, default='common'): names of attribute variables to include in summary names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • intformat : str, default='{ (d}'): format specifier for integer values
  • floatformat : str, default='{ (.4f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
Returns
  • summary (str): names and values of variables
def summary_fig_inset( self, ax, variables=None, fitline_kw=None, intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, loc=None, loc_units='axes', **kwargs):
620    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
621                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
622                          loc=None, loc_units='axes',
623                          **kwargs):
624        '''Display bivariate statistics as a table inset on a plot axis
625
626        Parameters
627        ----------
628        ax : matplotlib.Figure.Axis 
629            axis where the table will be displayed
630        variables : list or str, default='common'
631            names of attribute variables to include in summary
632            names are case insensitive            
633            The following strings are also accepted in place of a list 
634                "all" (displays all variables)
635                "common" (displays all measures of mean difference)
636        fitline_kw : dict, default=None
637            keywords passed to `fitline`
638        intformat : str, default='{:d}'
639            format specifier for integer values
640        floatformat : str, default='{:.3f}'
641            format specifier for floating point values
642        floatformat_fiteqn : str, default=floatformat
643            format specifier for slope and intercept (a,b) in y = a x + b
644        loc : tuple (x0,y0), default=(0.85, 0.05)
645            location on the axis where the table will be drawn
646            can be in data units or axes units [0-1]
647        loc_units : {'axes' (default), 'data'}
648            specifies whether loc has 'data' units or 'axes' units [0-1]
649                    
650        Returns
651        -------
652        text1, text2 : matplotlib text object
653            Artist for the two text boxes        
654        '''
655        # List of variables
656        variables = self._expand_variables(variables)
657
658        if floatformat_fiteqn is None:
659            floatformat_fiteqn = floatformat
660
661        # Default location in lower right corner
662        if loc is None:
663            loc = (0.8,0.05)
664
665        # Coordinates for loc
666        if loc_units.lower()=='data':
667            coord=ax.transData
668        elif loc_units.lower() in ['axes','axis']:
669            coord=ax.transAxes
670        else:
671            raise ValueError('Display units should be "Data" or "Axes"')
672
673        # Get a dict containing the needed variables
674        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
675
676        # Column of label text
677        label_text = '\n'.join([_texify_name(key)
678                                for key in summarydict])
679        # Column of value text
680        value_text = '\n'.join([_number2str(v,intformat,floatformat)
681                                for v in summarydict.values()])
682
683        # Check if horizontal alignment keyword is used
684        ha=''
685        try:
686            ha = kwargs['ha']
687        except KeyError:
688            pass
689        try:
690            ha = kwargs['horizontalalignment']
691        except KeyError:
692            pass
693
694        # For right alignment, align on values first
695        # Otherwise, align on labels
696        if ha=='right':
697            first_text = value_text
698            second_text = label_text
699            sign = -1
700        else:
701            first_text = label_text
702            second_text = value_text
703            sign = +1
704
705        # Add first column of text
706        t1=ax.text(loc[0],loc[1],
707                first_text,
708                transform=coord,
709                **kwargs
710                )
711
712        # Get width of first text column
713        bbox = t1.get_window_extent().transformed(coord.inverted())
714        width = bbox.x1-bbox.x0
715
716        # Add second column of text
717        t2 = ax.text(loc[0]+width*sign,loc[1],
718                     second_text,
719                     transform=coord,
720                     **kwargs
721                     )
722
723        ##################################
724        # Early version of this function using matplotlib.table.table()
725
726        # if isinstance(loc,(tuple,list)):
727        #     # Create an inset axis to contain the table
728        #     tableaxis = ax.inset_axes(loc)
729        #     table_width=1
730        # else:
731        #     tableaxis = ax
732
733        # # Display the table on the axis
734        # return mtable.table(
735        #     tableaxis,
736        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
737        #     rowLabels=[texify_name(key) for key in summarydict],
738        #     colWidths=[table_width/2]*2,
739        #     edges=edges,
740        #     loc=loc, bbox=bbox
741        #     )
742
743        return [t1,t2]

Display bivariate statistics as a table inset on a plot axis

Parameters
  • ax (matplotlib.Figure.Axis): axis where the table will be displayed
  • variables (list or str, default='common'): names of attribute variables to include in summary names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • intformat : str, default='{ (d}'): format specifier for integer values
  • floatformat : str, default='{ (.3f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • loc (tuple (x0,y0), default=(0.85, 0.05)): location on the axis where the table will be drawn can be in data units or axes units [0-1]
  • loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
  • text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb(x0, x1):
22def nmb( x0, x1 ):
23    '''Compute Normalized Mean Bias (NMB)
24
25    NMB = ( mean(x1) - mean(x0) ) / mean(x0)
26
27    Parameters
28    ----------
29    x0 : array_like
30        reference values
31    x1 : array_like
32        experiment values
33    '''
34
35    assert (len(x0) == len(x1)), \
36        "Parameters x0 and x1 must have the same length"
37
38    # Mean values
39    x0_mean = np.mean(x0)
40    x1_mean = np.mean(x1)
41
42    # Metric value
43    return x1_mean / x0_mean - 1

Compute Normalized Mean Bias (NMB)

NMB = ( mean(x1) - mean(x0) ) / mean(x0)

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmae(x0, x1):
45def nmae( x0, x1 ):
46    '''Compute Normalized Mean Absolute Error (NMAE)
47
48    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
49
50    Parameters
51    ---------
52    x0 : array_like
53        reference values
54    x1 : array_like
55        experiment values
56    '''
57
58     # Mean values
59    x0_mean = np.mean(x0)
60
61    # Mean absolute difference
62    abs_diff = np.mean( np.abs(x1 - x0) )
63
64    # Metric value
65    return abs_diff / np.abs( x0_mean )

Compute Normalized Mean Absolute Error (NMAE)

NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmbf(x0, x1):
68def nmbf( x0, x1 ):
69    '''Compute Normalized Mean Bias Factor (NMBF)
70
71    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
72
73    Parameters
74    ----------
75    x0 : array_like
76        reference values
77    x1 : array_like
78        experiment values
79    '''
80
81    # Ensure that arguments have the same length
82    assert (len(x0) == len(x1)), \
83        "Parameters x0 and x1 must have the same length"
84
85    # Mean values
86    x0_mean = np.mean(x0)
87    x1_mean = np.mean(x1)
88
89    # Metric value
90    if x1_mean >= x0_mean:
91        result = x1_mean / x0_mean - 1
92    else:
93        result= 1 - x0_mean / x1_mean
94    # Equivalent (faster?) implementation
95    #S = (mMean - oMean) / np.abs(mMean - oMean)
96    #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 )
97
98    return result

Compute Normalized Mean Bias Factor (NMBF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmaef(x0, x1):
100def nmaef( x0, x1 ):
101    '''Compute Normalized Mean Absolute Error Factor (NMAEF)
102
103    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
104    
105    Parameters
106    ----------
107    x0 : array_like
108        reference values
109    x1 : array_like
110        experiment values
111    '''
112
113    # Ensure that arguments have the same length
114    assert (len(x0) == len(x1)), \
115        "Parameters x0 and x1 must have the same length"
116
117    # Mean values
118    x0_mean = np.mean(x0)
119    x1_mean = np.mean(x1)
120
121    # Mean absolute difference
122    abs_diff = np.mean( np.abs(x1 - x0))
123
124    # Metric value
125    if x1_mean >= x0_mean:
126        result = abs_diff / x0_mean 
127    else:
128        result = abs_diff / x1_mean
129    # Equivalent (faster?) implementation
130    #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean)
131    #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) )
132
133    return result

Compute Normalized Mean Absolute Error Factor (NMAEF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values