Commit cd600a1

Enable integer value for PCA get_feature_names
1 parent d8fed2b commit cd600a1

2 files changed: +39 −20 lines
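A minimal usage sketch of the behaviour this commit adds: full_output may now be an integer n, in which case only the top n contributions per component (by absolute loading) are kept. This assumes a scikit-learn checkout that includes the patched PCA.get_feature_names shown in the diff below; the expected strings are taken from the updated doctest.

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    pca = PCA(n_components=2).fit(X)

    pca.get_feature_names()                   # ['pc0', 'pc1']
    pca.get_feature_names(full_output=True)   # ['0.84*x0 + 0.54*x1', '0.54*x0 - 0.84*x1']
    pca.get_feature_names(full_output=1)      # top-1 term only: ['0.84*x0', '-0.84*x1']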

sklearn/decomposition/pca.py

Lines changed: 24 additions & 14 deletions
@@ -15,6 +15,7 @@
 from scipy import linalg
 from scipy.special import gammaln
 
+from ..externals import six
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_random_state, as_float_array
 from ..utils import check_array
@@ -226,10 +227,12 @@ def get_feature_names(self, input_features=None, full_output=False):
             String names for input features if available. By default,
             "x0", "x1", ... "xn_features" is used.
 
-        full_output : boolean, default False
+        full_output : boolean or integer, default False
             When it is "True", return the principal components as the
             combination of the input features. If "False", will be just
-            the component names
+            the component names. If it is an integer n, it returns the
+            sorted top n contributions to each component.
+
 
         Returns
         -------
@@ -243,18 +246,20 @@ def get_feature_names(self, input_features=None, full_output=False):
        >>> import numpy as np
        >>> from sklearn.decomposition import PCA
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
-        >>> pca = PCA(n_components=2)
-        >>> pca.fit(X)
-        PCA(copy=True, n_components=2, whiten=False)
+        >>> pca = PCA(n_components=2).fit(X)
         >>> pca.get_feature_names(full_output=True)
-        ['0.838 * x0 + 0.545 * x1', '0.545 * x0 - 0.838 * x1']
+        ['0.84*x0 + 0.54*x1', '0.54*x0 - 0.84*x1']
+        >>> pca.get_feature_names(full_output=1)
+        ['0.84*x0', '-0.84*x1']
         >>> pca.get_feature_names()
         ['pc0', 'pc1']
         """
         check_is_fitted(self, 'components_')
 
         n_features = self.components_.shape[1]
         components = self.components_
+        contribution = np.argsort(-np.abs(components), axis=1)
+
         if input_features is None:
             input_features = ['x%d' % i for i in range(n_features)]
         else:
@@ -264,17 +269,22 @@ def get_feature_names(self, input_features=None, full_output=False):
                                  (len(input_features), self.n_features))
 
         def name_generator(coefficients, names):
-            yield "{0:.3f} * {1}".format(coefficients[0], names[0])
+            yield "{0:.2g}*{1}".format(coefficients[0], names[0])
             for c, n in zip(coefficients[1:], names[1:]):
-                yield "{0:s} {1:.3f} * {2}".format('-' if c < 0 else '+', abs(c), n)
+                yield "{0:s} {1:.2g}*{2}".format('-' if c < 0 else '+', abs(c), n)
 
-        if full_output:
-            feature_names = []
-            for component in range(self.n_components):
-                coefficients = components[component]
-                feature_names.append(' '.join(name_generator(coefficients, input_features)))
-        else:
+        if full_output is True:
+            feature_names = [' '.join(name_generator(components[i], input_features))
+                             for i in range(self.n_components)]
+        elif full_output is False:
             feature_names = ['pc{0}'.format(i) for i in range(self.n_components)]
+        elif isinstance(full_output, six.integer_types) and full_output < n_features:
+            required = contribution[:, :full_output]
+            input_features = np.asarray(input_features)
+            feature_names = [' '.join(name_generator(components[i][required[i]], input_features[required[i]]))
+                             for i in range(self.n_components)]
+        else:
+            raise ValueError("full_output must be integer or boolean")
         return feature_names
 
 
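To illustrate the selection logic in the integer branch above, here is a standalone sketch using only NumPy; `components` and `input_features` are stand-ins for `pca.components_` and the generated feature names, with values borrowed from the doctest.

    import numpy as np

    # Stand-in for pca.components_: rows are components, columns are input features.
    components = np.array([[0.84, 0.54],
                           [0.54, -0.84]])
    input_features = np.asarray(['x0', 'x1'])

    # Same idea as the patch: rank features per component by absolute loading ...
    contribution = np.argsort(-np.abs(components), axis=1)
    # ... and keep only the first n columns of that ranking (here n = 1).
    required = contribution[:, :1]

    for i in range(components.shape[0]):
        coefs = components[i][required[i]]
        names = input_features[required[i]]
        print(['{0:.2g}*{1}'.format(c, n) for c, n in zip(coefs, names)])
    # prints ['0.84*x0'] then ['-0.84*x1'], matching the doctest above.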
sklearn/decomposition/tests/test_pca.py

Lines changed: 15 additions & 6 deletions
@@ -7,6 +7,7 @@
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_no_warnings
+from sklearn.utils.testing import assert_array_equal
 
 from sklearn import datasets
 from sklearn.decomposition import PCA
@@ -351,9 +352,17 @@ def test_pca_score3():
     assert_true(ll.argmax() == 1)
 
 def test_get_feature_names():
-    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
-    pca = PCA(n_components=2)
-    pca.fit(X)
-    assert_equal(pca.get_feature_names(), ['pc0', 'pc1'])
-    assert_equal(pca.get_feature_names(full_output=True),
-                 ['0.838 * x0 + 0.545 * x1', '0.545 * x0 - 0.838 * x1'])
+    X1 = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    X2 = np.array([[-1, 1], [-2, 1], [-3, 2], [-1, 1], [-2, 1], [-3, 2]])
+    pca = PCA(n_components=2).fit(X1)
+    assert_array_equal(pca.get_feature_names(), ['pc0', 'pc1'])
+    assert_array_equal(pca.get_feature_names(full_output=True),
+                       ['0.84*x0 + 0.54*x1', '0.54*x0 - 0.84*x1'])
+    assert_array_equal(pca.get_feature_names(full_output=1),
+                       ['0.84*x0', '-0.84*x1'])
+    pca = PCA(n_components=2).fit(X2)
+    assert_array_equal(pca.get_feature_names(full_output=True),
+                       ['-0.88*x0 + 0.47*x1', '0.47*x0 + 0.88*x1'])
+    assert_array_equal(pca.get_feature_names(full_output=1),
+                       ['-0.88*x0', '0.88*x1'])
+

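For reference, the coefficients the test expects for X1 can be reproduced with plain NumPy; this is only a sketch of where 0.84 and 0.54 come from, and the signs of the singular vectors may differ from one LAPACK build to another.

    import numpy as np

    X1 = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]], dtype=float)
    Xc = X1 - X1.mean(axis=0)              # PCA operates on centered data
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    print(np.round(Vt, 2))
    # Up to a sign per row, the rows match pca.components_, roughly
    # [[0.84, 0.54], [0.54, -0.84]] -> '0.84*x0 + 0.54*x1' and '0.54*x0 - 0.84*x1'.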