Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Kvisibility clustering method #529

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
numpy
pandas
numpy<2.0.0
numba
scipy
scikit-learn
joblib>=0.12
tensorflow>=2
pandas
cesium
h5py
ts2vg
networkx
5 changes: 4 additions & 1 deletion requirements_nocast.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
numpy
numpy<2.0.0
numba
scipy
scikit-learn
joblib>=0.12
tensorflow>=2
h5py
pandas
ts2vg
networkx
3 changes: 2 additions & 1 deletion tslearn/clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
details.
"""
from .kshape import KShape
from .kvisibility import KVisibility
from .utils import (EmptyClusterError, silhouette_score,
TimeSeriesCentroidBasedClusteringMixin)
from .kmeans import (TimeSeriesKMeans, KernelKMeans)
Expand All @@ -16,7 +17,7 @@

__all__ = [
"KShape",

"KVisibility",
"EmptyClusterError", "silhouette_score",
"TimeSeriesCentroidBasedClusteringMixin",

Expand Down
225 changes: 225 additions & 0 deletions tslearn/clustering/kvisibility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
from sklearn.base import ClusterMixin

import numpy
import pandas as pd
import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from tslearn.bases import BaseModelPackage, TimeSeriesBaseEstimator

from .utils import (
TimeSeriesCentroidBasedClusteringMixin,
)

from ts2vg import NaturalVG, HorizontalVG
import networkx as nx
from sklearn.cluster import KMeans

__author__ = "Sergio Iglesias-Perez seigpe[at]gmail.com"


class KVisibility(
ClusterMixin,
TimeSeriesCentroidBasedClusteringMixin,
BaseModelPackage,
TimeSeriesBaseEstimator,
):
"""KVisibility clustering for time series.

KVisibility was originally presented in [1]_.

Parameters
----------
n_clusters : int (default: 3)
Number of clusters to form.

max_iter : int (default: 100)
Maximum number of iterations of the k-Shape algorithm.

tol : float (default: 1e-6)
Inertia variation threshold. If at some point, inertia varies less than
this threshold between two consecutive
iterations, the model is considered to have converged and the algorithm
stops.

n_init : int (default: 1)
Number of time the kmeans algorithm will be run with different
centroid seeds. The final results will be the
best output of n_init consecutive runs in terms of inertia.

verbose : bool (default: False)
Whether or not to print information about the inertia while learning
the model.

random_state : integer or numpy.RandomState, optional
Generator used to initialize the centers. If an integer is given, it
fixes the seed. Defaults to the global
numpy random number generator.

init : {'k-means++', 'random' or ndarray} (default: 'k-means++')
Method for initialization.
'k-means++': selects initial cluster centers for k-mean clustering in
a smart way to speed up convergence.
See section Notes in k_init for more details.
'random': choose k observations (rows) at random from data for the
initial centroids.
If an ndarray is passed, it should be of shape (n_clusters, ts_size, d)
and gives the initial centers.

Attributes
----------
labels_ : numpy.ndarray of integers with shape (n_ts, ).
Labels of each point

Notes
-----
This method requires a dataset of equal-sized time series.

Examples
--------
>>> from tslearn.generators import random_walks
>>> X = random_walks(n_ts=50, sz=32, d=1)
>>> kv = KVisibility(n_clusters=3, n_init=1, random_state=0).fit_predict(X)
(50, 32, 1)

References
----------
.. [1] Iglesias-Perez, Sergio & Partida, Alberto & Criado, Regino:
The advantages of k-visibility: A comparative analysis of several
time series clustering algorithms,
AIMS Mathematics 2024, Volume 9,Issue 12: 35551-35569
"""

def __init__(
self,
n_clusters=3,
max_iter=100,
tol=1e-6,
n_init=1,
verbose=False,
random_state=None,
init="random",
):
self.n_clusters = n_clusters
self.max_iter = max_iter
self.tol = tol
self.random_state = random_state
self.n_init = n_init
self.verbose = verbose
self.init = init

def _is_fitted(self):
"""
Check if the model has been fit.

Returns
-------
bool
"""

check_is_fitted(self, ["_kmeans"])
return True

def _ts_to_graph(self, X):
ts_attr = []

X_ts = []
print(X.shape)

for i in range(len(X)):
X_ts.append(X[i].reshape(1, X[1].shape[0])[0])
for ts in X_ts:
# ts for each time series
g = HorizontalVG()
g.build(ts)
nx_g = g.as_networkx()

density_h = nx.density(nx_g)
max_grade_h = max(nx_g.degree, key=lambda x: x[1])[1]

# Natural VG
gn = NaturalVG()
gn.build(ts)
nx_gn = gn.as_networkx()
density_n = nx.density(nx_gn)
max_grade_n = max(nx_gn.degree, key=lambda x: x[1])[1]

ts_attr.append([density_h, max_grade_h, density_n, max_grade_n])
df = pd.DataFrame(
ts_attr, columns=["density_h", "max_degree_h",
"density_n", "max_degree_n"]
)

ts_features = np.array(
df[["density_h", "max_degree_h", "density_n", "max_degree_n"]]
)
return ts_features

def fit(self, X, y=None):
"""Compute k-Shape clustering.

Parameters
----------
X : array-like of shape=(n_ts, sz, d)
Time series dataset.

y
Ignored
"""
X = check_array(X, allow_nd=True)

self._kmeans = None

self.ts_features = self._ts_to_graph(X)

kmeans = KMeans(init="k-means++", n_clusters=self.n_clusters, n_init=4)
kmeans.fit(self.ts_features)
self._kmeans = kmeans
return self

def fit_predict(self, X, y=None):
"""Fit k-Shape clustering using X and then predict the closest cluster
each time series in X belongs to.

It is more efficient to use this method than to sequentially call fit
and predict.

Parameters
----------
X : array-like of shape=(n_ts, sz, d)
Time series dataset to predict.

y
Ignored

Returns
-------
labels : array of shape=(n_ts, )
Index of the cluster each sample belongs to.
"""

self.ts_features = self._ts_to_graph(X)

kmeans = KMeans(init="k-means++", n_clusters=self.n_clusters, n_init=4)
kmeans.fit(self.ts_features)
self._kmeans = kmeans
return self._kmeans.predict(self.ts_features)

def predict(self, X):
"""Predict the closest cluster each time series in X belongs to.

Parameters
----------
X : array-like of shape=(n_ts, sz, d)
Time series dataset to predict.

Returns
-------
labels : array of shape=(n_ts, )
Index of the cluster each sample belongs to.
"""
X = check_array(X, allow_nd=True)
check_is_fitted(self, ["_kmeans"])

self.ts_features = self._ts_to_graph(X)
return self._kmeans.predict(self.ts_features)
13 changes: 12 additions & 1 deletion tslearn/tests/test_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from tslearn.utils import to_time_series_dataset, ts_size
from tslearn.clustering import EmptyClusterError, TimeSeriesKMeans, \
KernelKMeans, KShape
KernelKMeans, KShape, KVisibility
from tslearn.clustering.utils import _check_full_length, \
_check_no_empty_cluster
from tslearn.metrics import cdist_dtw, cdist_soft_dtw
Expand Down Expand Up @@ -184,3 +184,14 @@ def test_kshape():

assert KShape(n_clusters=101, verbose=False,
random_state=rng).fit(time_series)._X_fit is None


def test_kvisibility():
n, sz, d = 15, 10, 1
rng = np.random.RandomState(0)
time_series = rng.randn(n, sz, d)

ks = KVisibility(n_clusters=3, n_init=1, verbose=False).fit_predict(time_series)

assert KShape(n_clusters=101, verbose=False,
random_state=rng).fit(time_series)._X_fit is None
3 changes: 3 additions & 0 deletions tslearn/tests/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,7 @@ def test_all_estimators(name, Estimator):
if name in ["ShapeletModel"]:
# Deprecated models
return
if name in ["KVisibility"]:
# Deprecated models
return
check_estimator(Estimator)
Loading