Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
4935fc5
Added the omicron-print clustering method to EventTable class
dcorre Jan 14, 2020
485b5ee
Include omicron clustering algorithm in existing EventTable.cluster()…
dcorre Jan 16, 2020
c608b8a
Removed some empty lines
dcorre Jan 16, 2020
3fc2120
Respect pep8 format
dcorre Jan 16, 2020
4457b4b
Respect pep8 format
dcorre Jan 16, 2020
398b7fa
Remove return from the if else conditions
dcorre Mar 10, 2020
b87999e
Merge remote-tracking branch 'upstream/master' into cluster
dcorre Mar 10, 2020
1511a5d
Merge remote-tracking branch 'upstream/master' into cluster
dcorre Mar 24, 2020
c9de20f
Tsart and Tend of a cluster is the min and max of all the tiles now
dcorre May 1, 2020
77d8180
added some tests for the omicron clustering method
dcorre May 1, 2020
0934de2
Merge remote-tracking branch 'upstream/master'
dcorre May 1, 2020
976fa47
respect pep8 format
dcorre May 1, 2020
4076fa3
Adding pytest.fixture to test omicron clustering method
dcorre May 1, 2020
afd051c
fixed table shape for clustertable_omicron
dcorre May 1, 2020
575051f
Merge remote-tracking branch 'upstream/master'
dcorre May 5, 2020
f7f467e
respect pep8 format
dcorre May 1, 2020
d2b8c6a
Adding pytest.fixture to test omicron clustering method
dcorre May 1, 2020
ca692a9
fixed table shape for clustertable_omicron
dcorre May 1, 2020
f4edd0f
Added method in EventTable to keep track of the triggers within a giv…
dcorre May 5, 2020
837e31d
fix conflicts
dcorre May 5, 2020
a169853
Added some checks for missing trigger at the beginning and end of Eve…
dcorre May 5, 2020
ce2de61
Added a new test to compare omicron and standard method
dcorre May 5, 2020
716301d
Merge remote-tracking branch 'upstream/master' into cluster
dcorre Jul 8, 2020
2ba9afd
Updated identification of clusters methods in EventTable class
dcorre Jul 15, 2020
555e821
EventTable: moved test ensuring window is positive to avoid duplicates
dcorre Jul 16, 2020
a626a18
Merge remote-tracking branch 'upstream/master' into cluster
dcorre Jul 16, 2020
f959ee6
Test EventTable, use utils.assert_arra_equal instead of numpy.testing…
dcorre Jul 16, 2020
56ea67e
table.py, removed useless import
dcorre Jul 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 269 additions & 31 deletions gwpy/table/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,20 +720,244 @@ def filter(self, *column_filters):
"""
return filter_table(self, *column_filters)

def _identify_omicron_clusters(self, window):
    """Identify clusters following the omicron method.

    Rows (tiles) are visited in order of increasing ``tstart``. A tile
    joins the current cluster when

        tstart - cluster_tend <= window

    where ``cluster_tend`` is the largest ``tend`` seen so far in that
    cluster. Otherwise the tile starts a new cluster. The first tile
    always opens the first cluster.

    The result is stored on `self`:

    - ``self.clustersList``: list of clusters, each one a list of row
      positions **in the ``tstart``-sorted table**; empty list for an
      empty table
    - ``self.orderidx``: indices that sort the table by ``tstart``

    Parameters
    ----------
    window : `float`
        largest allowed gap between the end of a cluster and the start
        of the next tile for that tile to be merged into the cluster.
        Not validated by this method.

    Returns
    -------
    `True`
        always

    """
    # Sort table by tstart column
    orderidx = numpy.argsort(self['tstart'])
    tstart = self['tstart'][orderidx]
    tend = self['tend'][orderidx]

    clusters = []
    current = []
    cluster_tend = None
    for i, (start, end) in enumerate(zip(tstart, tend)):
        # `current` is only empty for the very first tile, which must open
        # a cluster unconditionally (otherwise an empty cluster could be
        # emitted and every cluster ID would be shifted by one)
        if current and start - cluster_tend <= window:
            current.append(i)
            # a cluster ends with its latest-ending tile
            if end > cluster_tend:
                cluster_tend = end
        else:
            if current:
                clusters.append(current)
            current = [i]
            cluster_tend = end

    # Close the last open cluster (nothing to close for an empty table)
    if current:
        clusters.append(current)

    self.clustersList = clusters

    # keep track of the order indexes
    self.orderidx = orderidx

    return True

def _identify_downselect_clusters(self, index, window):
    """Identify clusters of this `EventTable` over a given column,
    `index`.

    Rows are sorted by `index`; consecutive rows whose `index` values
    differ by at most `window` are pooled into the same cluster. A row
    with no neighbour within `window` forms a cluster on its own.

    The result is stored on `self`:

    - ``self.clustersList``: list of clusters, each one an integer array
      of row positions **in the `index`-sorted table**; empty list for
      an empty table
    - ``self.orderidx``: indices that sort the table by `index`

    Parameters
    ----------
    index : `str`
        name of the column which is used to search for clusters

    window : `float`
        largest difference in `index` between two consecutive (sorted)
        rows for them to share a cluster. Not validated by this method.

    Returns
    -------
    `True`
        always

    """
    # sort table by the required column
    orderidx = numpy.argsort(self[index])
    col = self[index][orderidx]
    nrows = len(col)

    if nrows:
        # Neighbouring rows are linked when they are within `window`;
        # every non-linked gap (including NaN differences) is a cluster
        # boundary, so splitting the row positions at those gaps yields
        # multi-row clusters and isolated rows in one pass.
        linked = numpy.asarray(numpy.diff(col) <= window)
        boundaries = numpy.where(numpy.logical_not(linked))[0] + 1
        clusters = numpy.split(numpy.arange(nrows), boundaries)
    else:
        # numpy.split would return a single empty cluster here
        clusters = []

    self.clustersList = clusters

    # keep track of the order indexes
    self.orderidx = orderidx

    return True
Comment on lines +723 to +858
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes more sense to me to have these functions return the list of clusters, rather than storing them in another internal property, since the list is only ever used internal to the identify_clusters method.

Can you please consider that change?


def identify_clusters(self, index, window):
"""Identify clusters for this `EventTable` either using
omicron method or a 'downselect' method which clusters
over a given column, `index`.

To use the clustering algorithm implemented in omicron, set `index` to
'omicron'

As a result, a numpy array is added to `self`, containing the ID
of the cluster a point belongs to, for each point (i.e. row) of the
original `EventTable`.

Parameters
----------
index : `str`
name of the column which is used to search for clusters.
If index == 'omicron': use omicron clustering algorithm

window : `float`
window to use when clustering data points, will raise
ValueError if `window > 0` is not satisfied

Returns
-------
`True`
always, once ``self.tile_cluster_id`` has been set

Examples
--------
To cluster an `EventTable` (``table``) whose `index` is
`end_time`, `window` is `0.1`:

>>> table.identify_clusters('end_time', 0.1)

To identify clusters in an `EventTable` (``table``) with the omicron
clustering algorithm, with a `window` of `0.1`:

>>> table.identify_clusters('omicron', 0.1)

It creates a numpy array containing the ID of the cluster
a point belongs to, for each point (i.e. row) of the original
`EventTable`.
You can retrieve it through:

>>> table.tile_cluster_id

"""
# Ensure window is a positive number.
if window <= 0.0:
raise ValueError('Window must be a positive value')

# Use same algorithm as omicron
if index == 'omicron':
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be nice at this point to break out into two private methods that perform the different clustering routines, rather than having a large if block. Something like:

    def identify_clusters(...)
        if index == "omicron":
            return self._identify_omicron_clusters(window)
        return self._identify_downselect_clusters(index, window)

Having smaller, more modular, private methods should make this easier to maintain, and would make adding additional clustering algorithms easier in the future.

self._identify_omicron_clusters(window)
else:
self._identify_downselect_clusters(index, window)

# Create a given ID for each cluster
cluster_id = numpy.arange(len(self.clustersList))
# Associate each tile (i.e. row in the sorted EventTable)
# to a cluster ID
tile_cluster_id = []
for i, s in enumerate(self.clustersList):
tile_cluster_id.extend([cluster_id[i]] * len(s))

tile_cluster_id = numpy.array(tile_cluster_id)

# Pay attention that it corresponds to rows in the sorted EventTable
# in _identify_omicron_clusters() or _identify_downselect_clusters()
# and not the rows in the original EventTable.

# In order to apply it to the original EventTable, one needs to
# reverse the ordering.
self.tile_cluster_id = tile_cluster_id[numpy.argsort(self.orderidx)]

return True

def cluster(self, index, rank, window):
"""Cluster this `EventTable` over a given column, `index`, maximizing
over a specified column in the table, `rank`.
"""Return the clusters in `EventTable`.

Cluster identification is done either using omicron method or
a 'downselect' method which clusters over a given column, `index`,
maximizing over a specified column in the table, `rank`.

The clustering algorithm uses a pooling method to identify groups
of points that are all separated in `index` by less than `window`.

Each cluster of nearby points is replaced by the point in that cluster
with the maximum value of `rank`.

With omicron, each cluster is defined with min/max values of tstart,
tend, fstart, fend considering all points in a cluster.
All other parameters are taken from the point maximising the `snr`.

To use the clustering algorithm implemented in omicron, set `index` to
'omicron'

Parameters
----------
index : `str`
name of the column which is used to search for clusters
name of the column which is used to search for clusters.
If index == 'omicron': use omicron clustering algorithm

rank : `str`
name of the column to maximize over in each cluster
Expand All @@ -744,41 +968,55 @@ def cluster(self, index, rank, window):

Returns
-------
table : `EventTable`
a new table that has had the clustering algorithm applied via
slicing of the original
table : `Table`
a new table that has had the clustering algorithm applied

Examples
--------
To cluster an `EventTable` (``table``) whose `index` is
`end_time`, `window` is `0.1`, and maximize over `snr`:

>>> table.cluster('end_time', 'snr', 0.1)
"""
if window <= 0.0:
raise ValueError('Window must be a positive value')

# Generate index and rank vectors that are ordered
orderidx = numpy.argsort(self[index])
col = self[index][orderidx]
param = self[rank][orderidx]
To cluster an `EventTable` (``table``) with the omicron clustering
algorithm, with a `window` of `0.1`, and maximize over `snr`:

# Find all points where the index vector changes by less than window
# and divide the resulting array into clusters of adjacent points
clusterpoints = numpy.where(numpy.diff(col) <= window)[0]
sublists = numpy.split(clusterpoints,
numpy.where(numpy.diff(clusterpoints) > 1)[0]+1)

# Add end-points to each cluster and find the index of the maximum
# point in each list
padded_sublists = [numpy.append(s, numpy.array([s[-1]+1]))
for s in sublists]
maxidx = [s[numpy.argmax(param[s])] for s in padded_sublists]

# Construct a mask that removes all points within clusters and
# replaces them with the maximum point from each cluster
mask = numpy.ones_like(col, dtype=bool)
mask[numpy.concatenate(padded_sublists)] = False
mask[maxidx] = True
>>> table.cluster('omicron','snr', 0.1)
"""

return self[orderidx[mask]]
# Identify the clusters with the chosen method.
self.identify_clusters(index, window)

# Initialise output table.
# Same format as the input EventTable but only one row
# for a given cluster.
out_Table = Table(names=self.colnames)

# store column to maximise over
param = self[rank]

# Get total number of clusters
nb_clusters = numpy.max(self.tile_cluster_id) + 1

# Loop over each cluster
# Get index of the trigger with highest value in column rank
# for each cluster
# Also get tstart_min and tend_max
for idx in numpy.arange(nb_clusters):
mask = self.tile_cluster_id == idx
out_Table.add_row(list(self[mask][numpy.argmax(param[mask])]))

# Modify some parameters for best representing a cluster.
# Only for omicron, so far.
# might be also better to use it for the other method as well,
# otherwise tstart, tend, fstart and fend do not make much sense.
if index == 'omicron':
out_Table['tstart'][idx] = numpy.min(self[mask]['tstart'])
out_Table['tend'][idx] = numpy.max(self[mask]['tend'])
out_Table['fstart'][idx] = numpy.min(self[mask]['fstart'])
out_Table['fend'][idx] = numpy.max(self[mask]['fend'])

# Add new column for cluster IDs
out_Table['cluster_id'] = numpy.arange(nb_clusters)

return out_Table
Loading