@@ -3656,12 +3656,12 @@ def stineman_interp(xi,x,y,yp=None):
                                   1 / (dy1 + dy2),))
     return yi
 
-def ksdensity(dataset, bw_method=None):
+class GaussianKDE(object):
     """
     Representation of a kernel-density estimate using Gaussian kernels.
 
     Call signature::
-        kde_dict = ksdensity(dataset, 'silverman')
+        kde = GaussianKDE(dataset, 'silverman')
 
     Parameters
     ----------
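For context while reading the rest of the diff, here is a minimal usage sketch of the class this hunk introduces. It assumes the patched module is matplotlib.mlab and that the constructor, `evaluate()` method, and `__call__` alias added in the later hunks are applied as shown; the variable names are illustrative.

```python
import numpy as np
from matplotlib.mlab import GaussianKDE  # assumed import location for this patch

np.random.seed(0)
data = np.random.normal(loc=0.0, scale=1.0, size=200)

# Old API removed by this diff: ksdensity(data, 'silverman') returned a dict
# with 'xmin', 'xmax', 'mean', 'median' and 'result' keys.

# New API: build the estimator once, then evaluate it wherever needed.
kde = GaussianKDE(data, bw_method='silverman')
grid = np.linspace(data.min(), data.max(), 100)
pdf = kde.evaluate(grid)   # same as kde(grid), via the __call__ alias
print(pdf.shape)           # (100,)
```

The change follows the usual scipy-style split: the bandwidth factor and covariance are computed once in `__init__`, and `evaluate` can then be called repeatedly on arbitrary grids instead of a single hard-coded one.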
@@ -3676,10 +3676,10 @@ def ksdensity(dataset, bw_method=None):
     Attributes
     ----------
     dataset : ndarray
-        The dataset with which `ksdensity` was initialized.
-    d : int
+        The dataset with which `GaussianKDE` was initialized.
+    dim : int
         Number of dimensions.
-    n : int
+    num_dp : int
         Number of datapoints.
     factor : float
         The bandwidth factor, obtained from `kde.covariance_factor`, with which
@@ -3690,117 +3690,127 @@ def ksdensity(dataset, bw_method=None):
     inv_cov : ndarray
         The inverse of `covariance`.
 
-    Returns
+    Methods
     -------
-    A dictionary mapping each various aspects of the computed KDE.
-    The dictionary has the following keys:
-
-        xmin : number
-            The min of the input dataset
-        xmax : number
-            The max of the input dataset
-        mean : number
-            The mean of the result
-        median: number
-            The median of the result
-        result: (# of points,)-array
-            The array of the evaluated PDF estimation
-
-    Raises
-    ------
-    ValueError : if the dimensionality of the input points is different than
-    the dimensionality of the KDE.
+    kde.evaluate(points) : ndarray
+        Evaluate the estimated pdf on a provided set of points.
+    kde(points) : ndarray
+        Same as kde.evaluate(points)
 
     """
 
     # This implementation with minor modification was too good to pass up.
     # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
 
-    dataset = np.array(np.atleast_2d(dataset))
-    xmin = dataset.min()
-    xmax = dataset.max()
+    def __init__(self, dataset, bw_method=None):
+        self.dataset = np.atleast_2d(dataset)
+        if not np.array(self.dataset).size > 1:
+            raise ValueError("`dataset` input should have multiple elements.")
+
+        self.dim, self.num_dp = np.array(self.dataset).shape
+
+        if bw_method is None:
+            pass
+        elif bw_method == 'scott':
+            self.covariance_factor = self.scotts_factor
+        elif bw_method == 'silverman':
+            self.covariance_factor = self.silverman_factor
+        elif np.isscalar(bw_method):
+            if not isinstance(bw_method, six.string_types):
+                self._bw_method = 'use constant'
+                self.covariance_factor = lambda: bw_method
+        elif callable(bw_method):
+            self._bw_method = bw_method
+            self.covariance_factor = lambda: self._bw_method(self)
+        else:
+            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
+                  "or a callable."
+            raise ValueError(msg)
+
+        # Computes the covariance matrix for each Gaussian kernel using
+        # covariance_factor().
+
+        self.factor = self.covariance_factor()
+        # Cache covariance and inverse covariance of the data
+        if not hasattr(self, '_data_inv_cov'):
+            self.data_covariance = np.atleast_2d(
+                np.cov(
+                    self.dataset,
+                    rowvar=1,
+                    bias=False))
+            self.data_inv_cov = np.linalg.inv(self.data_covariance)
+
+        self.covariance = self.data_covariance * self.factor ** 2
+        self.inv_cov = self.data_inv_cov / self.factor ** 2
+        self.norm_factor = np.sqrt(
+            np.linalg.det(
+                2 * np.pi * self.covariance)) * self.num_dp
+
+    def scotts_factor(self):
+        return np.power(self.num_dp, -1. / (self.dim + 4))
+
+    def silverman_factor(self):
+        return np.power(
+            self.num_dp * (self.dim + 2.0) / 4.0, -1. / (self.dim + 4))
+
+    # Default method to calculate bandwidth, can be overwritten by subclass
+    covariance_factor = scotts_factor
 
-    if not dataset.size > 1:
-        raise ValueError("`dataset` input should have multiple elements.")
+    def evaluate(self, points):
+        """Evaluate the estimated pdf on a set of points.
 
-    dim, num_dp = dataset.shape
+        Parameters
+        ----------
+        points : (# of dimensions, # of points)-array
+            Alternatively, a (# of dimensions,) vector can be passed in and
+            treated as a single point.
 
-    # ----------------------------------------------
-    # Set Bandwidth, defaulted to Scott's Factor
-    # ----------------------------------------------
-    scotts_factor = lambda: np.power(num_dp, -1. / (dim + 4))
-    silverman_factor = lambda: np.power(num_dp * (dim + 2.0)/4.0, -1. / (dim + 4))
+        Returns
+        -------
+        values : (# of points,)-array
+            The values at each point.
 
-    # Default method to calculate bandwidth, can be overwritten by subclass
-    covariance_factor = scotts_factor
+        Raises
+        ------
+        ValueError : if the dimensionality of the input points is different
+                     than the dimensionality of the KDE.
 
-    if bw_method is None:
-        pass
-    elif bw_method == 'scott':
-        covariance_factor = scotts_factor
-    elif bw_method == 'silverman':
-        covariance_factor = silverman_factor
-    elif np.isscalar(bw_method) and not isinstance(bw_method, six.string_types):
-        covariance_factor = lambda: bw_method
-    else:
-        msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
-        raise ValueError(msg)
-
-    # ---------------------------------------------------------------
-    # Computes covariance matrix for each Gaussian kernel with factor
-    # ---------------------------------------------------------------
-    factor = covariance_factor()
-
-    # Cache covariance and inverse covariance of the data
-    data_covariance = np.atleast_2d(np.cov(dataset, rowvar=1, bias=False))
-    data_inv_cov = np.linalg.inv(data_covariance)
-
-    covariance = data_covariance * factor ** 2
-    inv_cov = data_inv_cov / factor ** 2
-    norm_factor = np.sqrt(np.linalg.det(2 * np.pi * covariance)) * num_dp
-
-    # ----------------------------------------------
-    # Evaluate the estimated pdf on a set of points.
-    # ----------------------------------------------
-    points = np.atleast_2d(np.arange(xmin, xmax, (xmax - xmin)/100.))
-
-    dim_pts, num_dp_pts = np.array(points).shape
-    if dim_pts != dim:
-        if dim_pts == 1 and num_dp_pts == num_dp:
-            # points was passed in as a row vector
-            points = np.reshape(points, (dim, 1))
-            num_dp_pts = 1
+        """
+        points = np.atleast_2d(points)
+
+        dim, num_m = np.array(points).shape
+        if dim != self.dim:
+            if dim == 1 and num_m == self.dim:
+                # points was passed in as a row vector
+                points = np.reshape(points, (self.dim, 1))
+                num_m = 1
+            else:
+                msg = "points have dimension %s, dataset has dimension %s" % (
+                    dim, self.dim)
+                raise ValueError(msg)
+
+        result = np.zeros((num_m,), dtype=np.float)
+
+        if num_m >= self.num_dp:
+            # there are more points than data, so loop over data
+            for i in range(self.num_dp):
+                diff = self.dataset[:, i, np.newaxis] - points
+                tdiff = np.dot(self.inv_cov, diff)
+                energy = np.sum(diff * tdiff, axis=0) / 2.0
+                result = result + np.exp(-energy)
        else:
-            msg = "points have dimension %s,\
-            dataset has dimension %s" % (dim_pts, dim)
-            raise ValueError(msg)
+            # loop over points
+            for i in range(num_m):
+                diff = self.dataset - points[:, i, np.newaxis]
+                tdiff = np.dot(self.inv_cov, diff)
+                energy = np.sum(diff * tdiff, axis=0) / 2.0
+                result[i] = np.sum(np.exp(-energy), axis=0)
 
-    result = np.zeros((num_dp_pts,), dtype=np.float)
+        result = result / self.norm_factor
 
-    if num_dp_pts >= num_dp:
-        # there are more points than data, so loop over data
-        for i in range(num_dp):
-            diff = dataset[:, i, np.newaxis] - points
-            tdiff = np.dot(inv_cov, diff)
-            energy = np.sum(diff * tdiff, axis=0) / 2.0
-            result = result + np.exp(-energy)
-    else:
-        # loop over points
-        for i in range(num_dp_pts):
-            diff = dataset - points[:, i, np.newaxis]
-            tdiff = np.dot(inv_cov, diff)
-            energy = np.sum(diff * tdiff, axis=0) / 2.0
-            result[i] = np.sum(np.exp(-energy), axis=0)
-
-    result = result / norm_factor
-
-    return {
-        'xmin': xmin,
-        'xmax': xmax,
-        'mean': np.mean(dataset),
-        'median': np.median(dataset),
-        'result': result
-    }
+        return result
+
+    __call__ = evaluate
 
 ##################################################
 # Code related to things in and around polygons
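Because the dict-returning helper disappears in this patch, callers that depended on its output may want a small compatibility shim. The sketch below mirrors the grid spacing and dictionary keys of the removed ksdensity code; the wrapper name `ksdensity_like` and the import path are assumptions, not part of the patch.

```python
import numpy as np
from matplotlib.mlab import GaussianKDE  # assumed import location


def ksdensity_like(dataset, bw_method=None):
    """Approximate the removed ksdensity() return value with GaussianKDE."""
    dataset = np.atleast_2d(dataset)
    xmin, xmax = dataset.min(), dataset.max()
    # The removed code evaluated the PDF on ~100 evenly spaced points
    # in [xmin, xmax), using np.arange with step (xmax - xmin) / 100.
    grid = np.arange(xmin, xmax, (xmax - xmin) / 100.)
    kde = GaussianKDE(dataset, bw_method)
    return {
        'xmin': xmin,
        'xmax': xmax,
        'mean': np.mean(dataset),
        'median': np.median(dataset),
        'result': kde(grid),  # __call__ is an alias for evaluate
    }


out = ksdensity_like(np.random.normal(size=500), 'scott')
print(out['xmin'], out['xmax'], out['result'].shape)
```

As in the diff, 'scott' and 'silverman' derive the bandwidth factor from the data size and dimensionality (num_dp ** (-1 / (dim + 4)) and (num_dp * (dim + 2) / 4) ** (-1 / (dim + 4)) respectively), while a plain float fixes the factor directly and a callable receives the KDE instance and returns it.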