Commit cd600a1

Enable integer value for PCA get_feature_names
1 parent d8fed2b commit cd600a1

2 files changed: +39 −20 lines
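A minimal usage sketch of the behaviour this commit adds: full_output may now be an integer n, in which case only the top n contributions per component (by absolute loading) are kept. This assumes a scikit-learn checkout that includes the patched PCA.get_feature_names shown in the diff below; the expected strings are taken from the updated doctest.

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    pca = PCA(n_components=2).fit(X)

    pca.get_feature_names()                   # ['pc0', 'pc1']
    pca.get_feature_names(full_output=True)   # ['0.84*x0 + 0.54*x1', '0.54*x0 - 0.84*x1']
    pca.get_feature_names(full_output=1)      # top-1 term only: ['0.84*x0', '-0.84*x1']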

sklearn/decomposition/pca.py

Lines changed: 24 additions & 14 deletions
@@ -15,6 +15,7 @@
 from scipy import linalg
 from scipy.special import gammaln
 
+from ..externals import six
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_random_state, as_float_array
 from ..utils import check_array
@@ -226,10 +227,12 @@ def get_feature_names(self, input_features=None, full_output=False):
             String names for input features if available. By default,
             "x0", "x1", ... "xn_features" is used.
 
-        full_output : boolean, default False
+        full_output : boolean or integer, default False
             When it is "True", return the principal components as the
             combination of the input features. If "False", will be just
-            the component names
+            the component names. If it is an integer n, it returns the
+            sorted top n contributions to each component.
+
 
         Returns
         -------
@@ -243,18 +246,20 @@ def get_feature_names(self, input_features=None, full_output=False):
        >>> import numpy as np
        >>> from sklearn.decomposition import PCA
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
-        >>> pca = PCA(n_components=2)
-        >>> pca.fit(X)
-        PCA(copy=True, n_components=2, whiten=False)
+        >>> pca = PCA(n_components=2).fit(X)
         >>> pca.get_feature_names(full_output=True)
-        ['0.838 * x0 + 0.545 * x1', '0.545 * x0 - 0.838 * x1']
+        ['0.84*x0 + 0.54*x1', '0.54*x0 - 0.84*x1']
+        >>> pca.get_feature_names(full_output=1)
+        ['0.84*x0', '-0.84*x1']
         >>> pca.get_feature_names()
         ['pc0', 'pc1']
         """
         check_is_fitted(self, 'components_')
 
         n_features = self.components_.shape[1]
         components = self.components_
+        contribution = np.argsort(-np.abs(components), axis=1)
+
         if input_features is None:
             input_features = ['x%d' % i for i in range(n_features)]
         else:
@@ -264,17 +269,22 @@ def get_feature_names(self, input_features=None, full_output=False):
                                  (len(input_features), self.n_features))
 
         def name_generator(coefficients, names):
-            yield "{0:.3f} * {1}".format(coefficients[0], names[0])
+            yield "{0:.2g}*{1}".format(coefficients[0], names[0])
             for c, n in zip(coefficients[1:], names[1:]):
-                yield "{0:s} {1:.3f} * {2}".format('-' if c < 0 else '+', abs(c), n)
+                yield "{0:s} {1:.2g}*{2}".format('-' if c < 0 else '+', abs(c), n)
 
-        if full_output:
-            feature_names = []
-            for component in range(self.n_components):
-                coefficients = components[component]
-                feature_names.append(' '.join(name_generator(coefficients, input_features)))
-        else:
+        if full_output is True:
+            feature_names = [' '.join(name_generator(components[i], input_features))
+                             for i in range(self.n_components)]
+        elif full_output is False:
             feature_names = ['pc{0}'.format(i) for i in range(self.n_components)]
+        elif isinstance(full_output, six.integer_types) and full_output < n_features:
+            required = contribution[:, :full_output]
+            input_features = np.asarray(input_features)
+            feature_names = [' '.join(name_generator(components[i][required[i]], input_features[required[i]]))
+                             for i in range(self.n_components)]
+        else:
+            raise ValueError("full_output must be integer or boolean")
         return feature_names
 
 
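To illustrate the selection logic in the integer branch above, here is a standalone sketch using only NumPy; `components` and `input_features` are stand-ins for `pca.components_` and the generated feature names, with values borrowed from the doctest.

    import numpy as np

    # Stand-in for pca.components_: rows are components, columns are input features.
    components = np.array([[0.84, 0.54],
                           [0.54, -0.84]])
    input_features = np.asarray(['x0', 'x1'])

    # Same idea as the patch: rank features per component by absolute loading ...
    contribution = np.argsort(-np.abs(components), axis=1)
    # ... and keep only the first n columns of that ranking (here n = 1).
    required = contribution[:, :1]

    for i in range(components.shape[0]):
        coefs = components[i][required[i]]
        names = input_features[required[i]]
        print(['{0:.2g}*{1}'.format(c, n) for c, n in zip(coefs, names)])
    # prints ['0.84*x0'] then ['-0.84*x1'], matching the doctest above.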
sklearn/decomposition/tests/test_pca.py

Lines changed: 15 additions & 6 deletions
@@ -7,6 +7,7 @@
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_no_warnings
+from sklearn.utils.testing import assert_array_equal
 
 from sklearn import datasets
 from sklearn.decomposition import PCA
@@ -351,9 +352,17 @@ def test_pca_score3():
     assert_true(ll.argmax() == 1)
 
 def test_get_feature_names():
-    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
-    pca = PCA(n_components=2)
-    pca.fit(X)
-    assert_equal(pca.get_feature_names(), ['pc0', 'pc1'])
-    assert_equal(pca.get_feature_names(full_output=True),
-                 ['0.838 * x0 + 0.545 * x1', '0.545 * x0 - 0.838 * x1'])
+    X1 = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    X2 = np.array([[-1, 1], [-2, 1], [-3, 2], [-1, 1], [-2, 1], [-3, 2]])
+    pca = PCA(n_components=2).fit(X1)
+    assert_array_equal(pca.get_feature_names(), ['pc0', 'pc1'])
+    assert_array_equal(pca.get_feature_names(full_output=True),
+                       ['0.84*x0 + 0.54*x1', '0.54*x0 - 0.84*x1'])
+    assert_array_equal(pca.get_feature_names(full_output=1),
+                       ['0.84*x0', '-0.84*x1'])
+    pca = PCA(n_components=2).fit(X2)
+    assert_array_equal(pca.get_feature_names(full_output=True),
+                       ['-0.88*x0 + 0.47*x1', '0.47*x0 + 0.88*x1'])
+    assert_array_equal(pca.get_feature_names(full_output=1),
+                       ['-0.88*x0', '0.88*x1'])
+

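For reference, the coefficients the test expects for X1 can be reproduced with plain NumPy; this is only a sketch of where 0.84 and 0.54 come from, and the signs of the singular vectors may differ from one LAPACK build to another.

    import numpy as np

    X1 = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]], dtype=float)
    Xc = X1 - X1.mean(axis=0)              # PCA operates on centered data
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    print(np.round(Vt, 2))
    # Up to a sign per row, the rows match pca.components_, roughly
    # [[0.84, 0.54], [0.54, -0.84]] -> '0.84*x0 + 0.54*x1' and '0.54*x0 - 0.84*x1'.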