Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2530aef

Browse files
committed
FIX performance issue in _graph_connected_component
1 parent 51480fc commit 2530aef

File tree

2 files changed

+63
-18
lines changed

2 files changed

+63
-18
lines changed

sklearn/manifold/spectral_embedding_.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,27 @@ def _graph_connected_component(graph, node_id):
4242
belonging to the largest connected components of the given query
4343
node
4444
"""
45-
connected_components_matrix = np.zeros(
46-
shape=(graph.shape[0]), dtype=np.bool)
47-
nodes_to_explore = np.zeros(shape=(graph.shape[0]), dtype=np.bool)
48-
nodes_to_explore[node_id] = True
4945
n_node = graph.shape[0]
50-
nodes_to_add = np.zeros(shape=(graph.shape[0]), dtype=np.bool)
51-
for i in range(n_node):
52-
nodes_to_add.fill(False)
53-
for i in np.where(nodes_to_explore)[0]:
54-
nodes_to_add = np.logical_or(nodes_to_add, graph[i] != 0)
55-
connected_components_matrix = np.logical_or(
56-
connected_components_matrix, nodes_to_explore)
57-
if not nodes_to_add.any():
46+
if sparse.issparse(graph):
47+
# speed up row-wise access to boolean connection mask
48+
graph = (graph != 0).tocsr()
49+
connected_nodes = np.zeros(n_node, dtype=np.bool)
50+
nodes_to_explore = np.zeros(n_node, dtype=np.bool)
51+
nodes_to_explore[node_id] = True
52+
for _ in range(n_node):
53+
last_num_component = connected_nodes.sum()
54+
np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
55+
if last_num_component >= connected_nodes.sum():
5856
break
59-
# Swap arrays
60-
nodes_to_explore, nodes_to_add = nodes_to_add, nodes_to_explore
61-
return connected_components_matrix
57+
indices = np.where(nodes_to_explore)[0]
58+
nodes_to_explore.fill(False)
59+
for i in indices:
60+
if sparse.issparse(graph):
61+
neighbors = graph[i].toarray().ravel()
62+
else:
63+
neighbors = graph[i]
64+
np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
65+
return connected_nodes
6266

6367

6468
def _graph_is_connected(graph):

sklearn/manifold/tests/test_spectral_embedding.py

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33

44
from scipy.sparse import csr_matrix
55
from scipy.sparse import csc_matrix
6+
from scipy.sparse import coo_matrix
67
from scipy.linalg import eigh
78
import numpy as np
89
from numpy.testing import assert_array_almost_equal
10+
from numpy.testing import assert_array_equal
911

1012
from nose.tools import assert_raises
1113
from nose.plugins.skip import SkipTest
@@ -48,12 +50,50 @@ def _check_with_col_sign_flipping(A, B, tol=0.0):
4850
return True
4951

5052

53+
def test_sparse_graph_connected_component():
54+
rng = np.random.RandomState(42)
55+
n_samples = 300
56+
boundaries = [0, 42, 121, 200, n_samples]
57+
p = rng.permutation(n_samples)
58+
connections = []
59+
60+
for start, stop in zip(boundaries[:-1], boundaries[1:]):
61+
group = p[start:stop]
62+
# Connect all elements within the group at least once via an
63+
# arbitrary path that spans the group.
64+
for i in range(len(group) - 1):
65+
connections.append((group[i], group[i + 1]))
66+
67+
# Add some more random connections within the group
68+
min_idx, max_idx = 0, len(group) - 1
69+
n_random_connections = 1000
70+
source = rng.randint(min_idx, max_idx, size=n_random_connections)
71+
target = rng.randint(min_idx, max_idx, size=n_random_connections)
72+
connections.extend(zip(group[source], group[target]))
73+
74+
# Build a symmetric affinity matrix
75+
row_idx, column_idx = tuple(np.array(connections).T)
76+
data = rng.uniform(.1, 42, size=len(connections))
77+
affinity = coo_matrix((data, (row_idx, column_idx)))
78+
affinity = 0.5 * (affinity + affinity.T)
79+
80+
for start, stop in zip(boundaries[:-1], boundaries[1:]):
81+
component_1 = _graph_connected_component(affinity, p[start])
82+
component_size = stop - start
83+
assert_equal(component_1.sum(), component_size)
84+
85+
# We should retrieve the same component mask by starting from both ends
86+
# of the group
87+
component_2 = _graph_connected_component(affinity, p[stop - 1])
88+
assert_equal(component_2.sum(), component_size)
89+
assert_array_equal(component_1, component_2)
90+
91+
5192
def test_spectral_embedding_two_components(seed=36):
5293
# Test spectral embedding with two components
5394
random_state = np.random.RandomState(seed)
5495
n_sample = 100
55-
affinity = np.zeros(shape=[n_sample * 2,
56-
n_sample * 2])
96+
affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
5797
# first component
5898
affinity[0:n_sample,
5999
0:n_sample] = np.abs(random_state.randn(n_sample, n_sample)) + 2
@@ -208,7 +248,8 @@ def test_spectral_embedding_deterministic():
208248

209249

210250
def test_spectral_embedding_unnormalized():
211-
# Test that spectral_embedding is also processing unnormalized laplacian correctly
251+
# Test that spectral_embedding is also processing unnormalized laplacian
252+
# correctly
212253
random_state = np.random.RandomState(36)
213254
data = random_state.randn(10, 30)
214255
sims = rbf_kernel(data)

0 commit comments

Comments
 (0)