[SciPy-user] indices of consecutive elements
Pierre GM
pgmdevlist at gmail.com
Tue Dec 2 12:13:16 EST 2008
Daniel,
I coded a generic class that does what you want. It's not optimized,
but at least should get you started. Let me know if you find it useful
and if you find ways to tweak it...
Cheers,
P.
_____
class Cluster(object):
    """
    Group consecutive values of an array into clusters.

    The input is flattened, and two neighbouring values are put in the same
    cluster when ``operator(abs(difference), increment)`` is true.  With the
    default operator, a cluster is a run of consecutive values, each
    differing from its predecessor by at most ``increment``.

    Missing values are **not** handled: the input must be free of masked
    values.

    Parameters
    ----------
    darray : array_like
        Input data to clusterize.
    increment : float, optional
        Threshold the absolute difference between two consecutive values is
        compared to.  By default, use a value of 1.
    operator : function, optional
        Comparison operator called as ``operator(abs_diff, increment)``; a
        true result keeps the two values in the same cluster.
        By default, use :func:`numpy.less_equal`.

    Raises
    ------
    numpy.ma.MAError
        If ``darray`` has a ``mask`` attribute with at least one masked
        entry.

    Attributes
    ----------
    inishape : tuple
        Shape of the argument array (stored for reshaping masks).
    inisize : int
        Size of the argument array.
    uniques : ndarray
        First value of each cluster, in order of appearance.
    slices : ndarray
        Object array of the slices (into the flattened data) corresponding
        to each cluster.
    starts : ndarray
        Indices (into the flattened data) at which the clusters start.
    sizes : ndarray
        Number of elements of each cluster.
    clustered : list
        List of the clustered data, one 1-D array per cluster.

    Examples
    --------
    >>> A = [0, 0, 1, 2, 2, 2, 3, 4, 3, 4, 4, 4]
    >>> klust = Cluster(A, 0)
    >>> [c.tolist() for c in klust.clustered]
    [[0, 0], [1], [2, 2, 2], [3], [4], [3], [4, 4, 4]]
    >>> klust.uniques.tolist()
    [0, 1, 2, 3, 4, 3, 4]
    >>> x = [1.8, 1.3, 2.4, 1.2, 2.5, 3.9, 1.0, 3.8, 4.2, 3.3,
    ...      1.2, 0.2, 0.9, 2.7, 2.4, 2.8, 2.7, 4.7, 4.2, 0.4]
    >>> Cluster(x, 1.5).starts.tolist()
    [0, 6, 7, 10, 13, 17, 19]
    >>> Cluster(x, 2.5).starts.tolist()
    [0, 6, 7, 19]
    >>> Cluster(x, 2.5, np.greater).starts.tolist()
    [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    >>> y = [0, -1, 0, 0, 0, 1, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1,
    ...      1, 0, 0]
    >>> Cluster(y, 0).starts.tolist()
    [0, 1, 2, 5, 7, 10, 12, 16, 18]
    """

    def __init__(self, darray, increment=1, operator=np.less_equal):
        """
        Compute the clusters of ``darray`` and store them on the instance.

        Parameters
        ----------
        darray : array_like
            Input data to clusterize.
        increment : float, optional
            Threshold the absolute difference between two consecutive
            values is compared to.  By default, use a value of 1.
        operator : function, optional
            Comparison operator for the definition of clusters.
            By default, use :func:`numpy.less_equal`.

        Raises
        ------
        numpy.ma.MAError
            If ``darray`` carries a mask with at least one masked entry.
        """
        if hasattr(darray, 'mask') and darray.mask.any():
            raise np.ma.MAError(
                "Masked arrays should be filled prior clustering.")
        darray = np.asanyarray(darray)
        # All cluster indices refer to the flattened data.
        flat = darray.ravel()
        n = flat.size
        self.inishape = darray.shape
        self.inisize = n
        if n == 0:
            # No data: a lone boundary yields zero clusters below.
            sid = np.zeros(1, dtype=int)
        else:
            # True where two neighbours do NOT satisfy the clustering
            # condition, i.e. where a new cluster begins.
            breaks = np.logical_not(
                operator(np.absolute(np.diff(flat)), increment))
            # Cluster boundaries: 0, every break position, and n.
            sid = np.r_[[0], np.arange(1, n).compress(breaks), [n]]
        starts = sid[:-1]
        stops = sid[1:]
        # Explicit object dtype so the container is always a 1-D array of
        # slice objects, even when there are no clusters.
        slobj = np.array(
            [slice(i, j) for (i, j) in zip(starts.tolist(), stops.tolist())],
            dtype=object)
        #
        self.uniques = flat[starts]
        # Slice the flattened data: the slices index it, not the first axis
        # of a N-d input.
        self.clustered = [flat[k] for k in slobj]
        self.sizes = np.diff(sid)
        self.slices = slobj
        self.starts = starts

    def markonsize(self, operator, sizethresh):
        """
        Create a **mask** for the clusters that do not meet a size
        requirement.

        The output is ``False`` where the element belongs to a cluster for
        which ``operator(size, sizethresh)`` is true, ``True`` otherwise.

        Parameters
        ----------
        operator : function
            Comparison operator, called as ``operator(sizes, sizethresh)``.
        sizethresh : float
            Requirement for the sizes of the clusters.

        Returns
        -------
        ndarray
            Boolean array with the shape of the initial input.
        """
        resmask = np.ones(self.inisize, dtype=bool)
        for k in self.slices[operator(self.sizes, sizethresh)]:
            resmask[k] = False
        return resmask.reshape(self.inishape)

    def mark_greaterthan(self, sizemin):
        """
        Shortcut for ``markonsize(numpy.greater_equal, sizemin)``.

        The output is ``False`` for the elements of clusters with at least
        ``sizemin`` elements, and ``True`` for the elements of smaller
        clusters.

        Parameters
        ----------
        sizemin : int
            Minimum size of the clusters.

        Returns
        -------
        ndarray
            Boolean array with the shape of the initial input.

        See Also
        --------
        markonsize
            Creates a **mask** for the clusters that do not meet a size
            requirement.
        """
        return self.markonsize(np.greater_equal, sizemin)

    def grouped_slices(self):
        """
        Return a dictionary with the distinct values of ``self.uniques`` as
        keys, and the list of slices of the clusters starting with that
        value as values.

        See Also
        --------
        grouped_limits
            Same grouping, with (start, stop) pairs instead of slices.
        """
        # Pre-seed with the sorted distinct values so the key order is
        # deterministic.
        output = {k: [] for k in np.unique(self.uniques)}
        for (k, v) in zip(self.uniques, self.slices):
            output[k].append(v)
        return output

    def grouped_limits(self):
        """
        Return a dictionary with the distinct values of ``self.uniques`` as
        keys, and an array of (starting index, ending index) pairs of the
        corresponding clusters as values.

        The ending index is the ``stop`` of the cluster slice, i.e. it is
        exclusive.

        See Also
        --------
        grouped_slices
            Same grouping, with slices instead of (start, stop) pairs.
        """
        output = {k: [] for k in np.unique(self.uniques)}
        for (k, v) in zip(self.uniques, self.slices):
            output[k].append((v.start, v.stop))
        for k in output:
            output[k] = np.array(output[k])
        return output
_____
On Dec 2, 2008, at 11:43 AM, Daniel Ashbrook wrote:
> I'm trying to figure out a way to return the indices of the start and
> end of a run of consecutive elements that match some condition, but
> only
> if there are more than a certain number.
>
> For example, take the array (with indices in comment for clarity):
>
> #0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
> [0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0]
>
> I want to find the start and end indices of all runs of 1s with length
> of 4 or longer; so here the answer would be:
>
> [[2,5], [15,18]]
>
> Is there a reasonable way to do this without looping? I've been
> playing
> around with diff() and where() but without too much progress.
>
> Thanks,
>
>
> dan
> _______________________________________________
> SciPy-user mailing list
> SciPy-user at scipy.org
> http://projects.scipy.org/mailman/listinfo/scipy-user
More information about the SciPy-User
mailing list