Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

[MRG] FIX performance issue in _graph_connected_component #6268

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 19 additions & 15 deletions sklearn/manifold/spectral_embedding_.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,27 @@ def _graph_connected_component(graph, node_id):
belonging to the largest connected components of the given query
node
"""
connected_components_matrix = np.zeros(
shape=(graph.shape[0]), dtype=np.bool)
nodes_to_explore = np.zeros(shape=(graph.shape[0]), dtype=np.bool)
nodes_to_explore[node_id] = True
n_node = graph.shape[0]
nodes_to_add = np.zeros(shape=(graph.shape[0]), dtype=np.bool)
for i in range(n_node):
nodes_to_add.fill(False)
for i in np.where(nodes_to_explore)[0]:
nodes_to_add = np.logical_or(nodes_to_add, graph[i] != 0)
connected_components_matrix = np.logical_or(
connected_components_matrix, nodes_to_explore)
if not nodes_to_add.any():
if sparse.issparse(graph):
# speed up row-wise access to boolean connection mask
graph = graph.tocsr()
connected_nodes = np.zeros(n_node, dtype=np.bool)
nodes_to_explore = np.zeros(n_node, dtype=np.bool)
nodes_to_explore[node_id] = True
for _ in range(n_node):
last_num_component = connected_nodes.sum()
np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
if last_num_component >= connected_nodes.sum():
break
# Swap arrays
nodes_to_explore, nodes_to_add = nodes_to_add, nodes_to_explore
return connected_components_matrix
indices = np.where(nodes_to_explore)[0]
nodes_to_explore.fill(False)
for i in indices:
if sparse.issparse(graph):
neighbors = graph[i].toarray().ravel()
else:
neighbors = graph[i]
np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
return connected_nodes


def _graph_is_connected(graph):
Expand Down
47 changes: 44 additions & 3 deletions sklearn/manifold/tests/test_spectral_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@

from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import coo_matrix
from scipy.linalg import eigh
import numpy as np
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal

from nose.tools import assert_raises
from nose.plugins.skip import SkipTest
Expand Down Expand Up @@ -48,12 +50,50 @@ def _check_with_col_sign_flipping(A, B, tol=0.0):
return True


def test_sparse_graph_connected_component():
    """Check ``_graph_connected_component`` on a sparse, multi-component graph.

    A random permutation of ``n_samples`` node indices is cut into four
    contiguous groups.  Edges are only ever added between members of the
    same group, and every group is spanned by a path, so each group is
    exactly one connected component of the resulting affinity graph.

    For every group the test asserts that the component mask found from
    the first member has as many ``True`` entries as the group has members,
    and that it is identical to the mask found from the last member.
    """
    rng = np.random.RandomState(42)
    n_samples = 300
    boundaries = [0, 42, 121, 200, n_samples]
    p = rng.permutation(n_samples)
    connections = []

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        connections.extend(zip(group[:-1], group[1:]))

        # Add some more random connections within the group.  The upper
        # bound of ``randint`` is exclusive, so ``len(group)`` is passed to
        # make every member (including the last one) eligible.
        n_random_connections = 1000
        source = rng.randint(0, len(group), size=n_random_connections)
        target = rng.randint(0, len(group), size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix.  The shape is given explicitly so
    # that it does not depend on the largest index present in the edge list.
    row_idx, column_idx = np.array(connections).T
    data = rng.uniform(.1, 42, size=len(connections))
    affinity = coo_matrix((data, (row_idx, column_idx)),
                          shape=(n_samples, n_samples))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        component_size = stop - start
        component_1 = _graph_connected_component(affinity, p[start])
        assert_equal(component_1.sum(), component_size)

        # We should retrieve the same component mask when starting from
        # either end of the group.
        component_2 = _graph_connected_component(affinity, p[stop - 1])
        assert_equal(component_2.sum(), component_size)
        assert_array_equal(component_1, component_2)


def test_spectral_embedding_two_components(seed=36):
# Test spectral embedding with two components
random_state = np.random.RandomState(seed)
n_sample = 100
affinity = np.zeros(shape=[n_sample * 2,
n_sample * 2])
affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
# first component
affinity[0:n_sample,
0:n_sample] = np.abs(random_state.randn(n_sample, n_sample)) + 2
Expand Down Expand Up @@ -208,7 +248,8 @@ def test_spectral_embedding_deterministic():


def test_spectral_embedding_unnormalized():
# Test that spectral_embedding is also processing unnormalized laplacian correctly
# Test that spectral_embedding is also processing unnormalized laplacian
# correctly
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
Expand Down