Mahalanobis distance metric issue with OPTICS

While fitting a NumPy array with OPTICS using the Mahalanobis distance as the metric, it raises an error asking for either V (the covariance matrix) or VI (its inverse). After passing the covariance matrix via metric_params, even with the brute algorithm setting, it still does not fit.

In [1]:
import warnings
warnings.filterwarnings(action='ignore')
In [2]:
import numpy as np
from sklearn.cluster import OPTICS
In [3]:
# distances 
scipy_spatial_distances =  ['braycurtis', 'canberra', 'chebyshev', 'correlation', 
                            'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 
                            'minkowski', 'rogerstanimoto','russellrao', 'seuclidean', 
                            'sokalmichener','sokalsneath','sqeuclidean','yule']

# test array
test_array = np.random.rand(25,10)

for distance in scipy_spatial_distances:
    try:
        OPTICS(metric=distance).fit(test_array)
    except Exception as e:
        print('FAILURE: {}\nERROR_MSG: {}\n'.format(distance,e))
FAILURE: mahalanobis
ERROR_MSG: Must provide either V or VI for Mahalanobis distance

FAILURE: seuclidean
ERROR_MSG: __init__() takes exactly 1 positional argument (0 given)

Passing metric_params

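After passing the covariance matrix via metric_params (default algorithm), it raises a ValueError saying that the size of V does not match. The LinAlgError appears only with the brute algorithm, shown further below.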

In [4]:
OPTICS(metric='mahalanobis',metric_params={'V': np.cov(test_array)}).fit(test_array)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-d3018a3d4234> in <module>()
----> 1 OPTICS(metric='mahalanobis',metric_params={'V': np.cov(test_array)}).fit(test_array)

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/cluster/optics_.py in fit(self, X, y)
    246              leaf_size=self.leaf_size, metric=self.metric,
    247              metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs,
--> 248              max_eps=self.max_eps)
    249 
    250         # Extract clusters from the calculated orders and reachability

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/cluster/optics_.py in compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs)
    454                             n_jobs=n_jobs)
    455 
--> 456     nbrs.fit(X)
    457     # Here we first do a kNN query for each point, this differs from
    458     # the original OPTICS that only used epsilon range queries.

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/neighbors/base.py in fit(self, X, y)
    928             or [n_samples, n_samples] if metric='precomputed'.
    929         """
--> 930         return self._fit(X)

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/neighbors/base.py in _fit(self, X)
    251             self._tree = BallTree(X, self.leaf_size,
    252                                   metric=self.effective_metric_,
--> 253                                   **self.effective_metric_params_)
    254         elif self._fit_method == 'kd_tree':
    255             self._tree = KDTree(X, self.leaf_size,

sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.ball_tree.BinaryTree.__init__()

sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.ball_tree.BinaryTree._recursive_build()

sklearn/neighbors/ball_tree.pyx in sklearn.neighbors.ball_tree.init_node()

sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.ball_tree.BinaryTree.rdist()

sklearn/neighbors/dist_metrics.pyx in sklearn.neighbors.dist_metrics.MahalanobisDistance.rdist()

ValueError: Mahalanobis dist: size of V does not match

Brute algorithm

Changing the algorithm to brute also doesn't seem to work: it raises LinAlgError (singular matrix).

In [5]:
test_array = np.random.rand(25,10)

OPTICS(algorithm='brute',metric='mahalanobis',metric_params={'V': np.cov(test_array)}).fit(test_array)
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
<ipython-input-5-1d41d73e1f2c> in <module>()
      1 test_array = np.random.rand(25,10)
      2 
----> 3 OPTICS(algorithm='brute',metric='mahalanobis',metric_params={'V': np.cov(test_array)}).fit(test_array)

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/cluster/optics_.py in fit(self, X, y)
    246              leaf_size=self.leaf_size, metric=self.metric,
    247              metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs,
--> 248              max_eps=self.max_eps)
    249 
    250         # Extract clusters from the calculated orders and reachability

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/cluster/optics_.py in compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs)
    485                             processed=processed, X=X, nbrs=nbrs,
    486                             metric=metric, metric_params=metric_params,
--> 487                             p=p, max_eps=max_eps)
    488     if np.all(np.isinf(reachability_)):
    489         warnings.warn("All reachability values are inf. Set a larger"

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/cluster/optics_.py in _set_reach_dist(core_distances_, reachability_, predecessor_, point_index, processed, X, nbrs, metric, metric_params, p, max_eps)
    520         dists = pairwise_distances(P, np.take(X, unproc, axis=0),
    521                                    metric, n_jobs=None,
--> 522                                    **_params).ravel()
    523 
    524     rdists = np.maximum(dists, core_distances_[point_index])

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/metrics/pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
   1562 
   1563         # precompute data-derived metric params
-> 1564         params = _precompute_metric_params(X, Y, metric=metric, **kwds)
   1565         kwds.update(**params)
   1566 

/home/ansible/anaconda3/lib/python3.6/site-packages/sklearn/metrics/pairwise.py in _precompute_metric_params(X, Y, metric, **kwds)
   1278             VI = np.linalg.inv(np.cov(X.T)).T
   1279         else:
-> 1280             VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T
   1281         return {'VI': VI}
   1282     return {}

/home/ansible/anaconda3/lib/python3.6/site-packages/numpy/linalg/linalg.py in inv(a)
    549     signature = 'D->D' if isComplexType(t) else 'd->d'
    550     extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
--> 551     ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
    552     return wrap(ainv.astype(result_t, copy=False))
    553 

/home/ansible/anaconda3/lib/python3.6/site-packages/numpy/linalg/linalg.py in _raise_linalgerror_singular(err, flag)
     95 
     96 def _raise_linalgerror_singular(err, flag):
---> 97     raise LinAlgError("Singular matrix")
     98 
     99 def _raise_linalgerror_nonposdef(err, flag):

LinAlgError: Singular matrix

If I generate another random matrix, OPTICS sometimes fits.

This is very rare, though: I had to run the following cell a couple of times until it fitted.

Should I be doing this to my distribution? See: https://stackoverflow.com/questions/44305456/why-am-i-getting-linalgerror-singular-matrix-from-grangercausalitytests?rq=1

In [8]:
test_array = np.random.rand(25,10)
OPTICS(algorithm='brute',metric='mahalanobis',metric_params={'V': np.cov(test_array)}).fit(test_array)
Out[8]:
OPTICS(algorithm='brute', cluster_method='xi', eps=None, leaf_size=30,
       max_eps=inf, metric='mahalanobis',
       metric_params={'V': array([[ 3.27587302e-02, -5.34082857e-03, -1.25681605e-02,
        -1.06398882e-03, -7.21287087e-03,  1.19331109e-02,
         1.85445360e-02,  2.28307759e-02,  3.30808545e-02,
        -1.05390097e-02,  6.36130203e-03,  1.91442323e-02,
        -2.92995396e-02, -4.74916757e-04, -6.528...
         1.16016164e-02,  3.80245917e-02,  2.78324755e-02,
        -4.10059694e-02, -1.00994414e-02,  4.22168197e-02,
        -4.40241401e-02,  1.00511974e-02, -2.38136500e-04,
         2.13117802e-02,  3.47845050e-02,  3.20153429e-02,
         1.53804081e-02,  1.74729690e-02,  2.72095286e-03,
        -4.55212282e-02, -2.68276037e-02, -3.65877614e-02,
         8.51425049e-02]])},
       min_cluster_size=None, min_samples=5, n_jobs=None, p=2,
       predecessor_correction=True, xi=0.05)