# URC_computeClusterIndex_V4.py
import sys
#sys.path.append('/media/enrique/Disk1/Proyectos/UnsupervisedRippleClassification/Code/Python')
import matplotlib.pyplot as plt
import numpy as np
from scipy import sparse, linalg
import pandas as pd
from sklearn.cluster import SpectralClustering
from sklearn.neighbors import kneighbors_graph, NearestNeighbors
from sklearn.metrics import pairwise_distances
from sklearn.manifold import Isomap
#import URC_get_data_contour
def filter_noisy_outliers(data):
    """Return the indices of low-density points (noise candidates)."""
    D = pairwise_distances(data)
    np.fill_diagonal(D, np.nan)
    # Count, for each point, the neighbors closer than the 1st percentile of all pairwise distances
    nnDist = np.sum(D < np.nanpercentile(D, 1), axis=1) - 1
    # Points in the bottom 20% of that neighbor count are flagged as noise
    noiseIdx = np.where(nnDist < np.percentile(nnDist, 20))[0]
    return noiseIdx
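
# A minimal usage sketch for filter_noisy_outliers (the data below is made up
# for illustration): the function returns indices of low-density points, so
# callers must delete those rows themselves.
#
#   rng = np.random.default_rng(0)
#   cloud = np.vstack((rng.normal(0, 0.1, (200, 2)),   # dense blob
#                      rng.uniform(-3, 3, (10, 2))))   # sparse outliers
#   noiseIdx = filter_noisy_outliers(cloud)
#   denoised = np.delete(cloud, noiseIdx, axis=0)
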
def generate_graph_laplacian(data, nn):
    """Generate graph Laplacian from data."""
    # Adjacency matrix: symmetrized k-nearest-neighbors connectivity.
    connectivity = kneighbors_graph(X=data, n_neighbors=nn, mode='connectivity')
    adjacency_matrix_s = (1/2)*(connectivity + connectivity.T)
    # Graph Laplacian.
    graph_laplacian_s = sparse.csgraph.laplacian(csgraph=adjacency_matrix_s, normed=False)
    graph_laplacian = graph_laplacian_s.toarray()
    return graph_laplacian
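
# Sketch (illustrative values): for a modest n_neighbors, the k-NN graph of two
# well-separated blobs is disconnected, which the spectrum computed below can
# detect. Row sums of the unnormalized Laplacian are exactly 0.
#
#   blobs = np.vstack((np.random.randn(50, 2), np.random.randn(50, 2) + 10))
#   L = generate_graph_laplacian(blobs, nn=8)
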
def compute_spectrum_graph_laplacian(graph_laplacian):
    """Compute eigenvalues and eigenvectors and project
    them onto the real numbers.
    """
    # eigsh/eigs require k < n, so cap the number of eigenpairs requested
    if graph_laplacian.shape[0] > 50:
        kvec = 50
    else:
        kvec = graph_laplacian.shape[0] - 1
    # Use the symmetric solver when the Laplacian is Hermitian
    if np.allclose(graph_laplacian, graph_laplacian.conj().T):
        eigenvals, eigenvcts = sparse.linalg.eigsh(graph_laplacian, k=kvec, which='SM')
    else:
        eigenvals, eigenvcts = sparse.linalg.eigs(graph_laplacian, k=kvec, which='SM')
    eigenvals = np.real(eigenvals)
    eigenvcts = np.real(eigenvcts)
    return eigenvals, eigenvcts
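
# The multiplicity of the zero eigenvalue of an unnormalized graph Laplacian
# equals the number of connected components, e.g. for the two-blob graph above:
#
#   eigenvals, eigenvcts = compute_spectrum_graph_laplacian(L)
#   n_components = np.sum(np.abs(eigenvals) < 1e-5)   # expected: 2
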
def compute_connected_components(data):
    """Estimate the number of clusters as the number of connected
    components of the k-NN graph."""
    if data.shape[0] >= 20:
        # filter_noisy_outliers returns indices, so remove those points here
        noiseIdx = filter_noisy_outliers(data)
        data = np.delete(data, noiseIdx, axis=0)
        graph_laplacian = generate_graph_laplacian(data, 8)
        eigenvals, eigenvcts = compute_spectrum_graph_laplacian(graph_laplacian)
        kn = len(np.argwhere(abs(eigenvals) < 1e-5))
    else:
        kn = 1
    return kn
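
# Putting the two steps together (hypothetical data): the function returns the
# estimated number of disjoint groups directly.
#
#   kn = compute_connected_components(blobs)   # expected: 2 for the blobs above
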
def assign_labels(data, kn):
    """Cluster data into kn groups with spectral clustering (labels start at 1)."""
    if kn > 1:
        clusModel = SpectralClustering(n_clusters=kn, affinity='nearest_neighbors', assign_labels='kmeans')
        clusModel.fit(data)
        labels = clusModel.labels_ + 1
    else:
        labels = np.ones(data.shape[0])
    return labels
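
# assign_labels is the companion step: once kn is known, it partitions the data
# and returns 1-based labels (all ones when kn <= 1). A sketch, reusing the
# illustrative blobs from above:
#
#   kn = compute_connected_components(blobs)
#   labels = assign_labels(blobs, kn)   # values in {1, ..., kn}
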
def compute_pointCloudsOverlap(cloud1, cloud2, k, distance_metric):
    """Percentage of points in cloud1 whose k-NN links reach into cloud2."""
    # Stack both clouds
    cloud_all = np.vstack((cloud1, cloud2))
    # Create cloud label (1 for cloud1, 2 for cloud2)
    cloud_label = np.hstack((np.ones(cloud1.shape[0]), np.ones(cloud2.shape[0])*2))
    # Compute k-neighbors graph weighted by cloud label
    if distance_metric == 'euclidean':
        connectivity = kneighbors_graph(X=cloud_all, n_neighbors=k, mode='connectivity', include_self=False).toarray() * cloud_label
    elif distance_metric == 'geodesic':
        model_iso = Isomap(n_components=1)
        model_iso.fit(cloud_all)
        dist_mat = model_iso.dist_matrix_
        knn_distance_based = NearestNeighbors(n_neighbors=k, metric="precomputed").fit(dist_mat)
        connectivity = knn_distance_based.kneighbors_graph(mode='connectivity').toarray() * cloud_label
    # Weighted degree of each point of cloud1: exceeds k only if some neighbors lie in cloud2
    degree = np.sum(connectivity, axis=1)[cloud_label==1]
    # Compute overlap percentage
    overlap = (np.sum(degree > k) / degree.shape[0])*100
    return overlap
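
# A sketch of the overlap measure (made-up clouds): a cloud1 point counts as
# overlapping when its weighted degree exceeds k, i.e. at least one of its k
# neighbors carries label 2 (belongs to cloud2).
#
#   a = np.random.randn(100, 2)
#   b = np.random.randn(100, 2) + 0.5    # partially overlapping with a
#   c = np.random.randn(100, 2) + 20.0   # far from a
#   compute_pointCloudsOverlap(a, b, 3, 'euclidean')   # large percentage
#   compute_pointCloudsOverlap(a, c, 3, 'euclidean')   # ~0
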
def computeClusterIndex_V4(emb, label, nBins, dimNames, plotCluster, overlapThreshold=0.5, distance_metric='euclidean', **kwargs):
    # Preprocess data
    emb = emb[dimNames].to_numpy()
    # for d in range(emb.shape[1]):
    #     emb[:,d] = (emb[:,d] - np.nanmean(emb[:,d])) / np.nanstd(emb[:,d])
    # Delete nan entries from label and emb
    nanIdx = np.where(np.isnan(label))[0]
    emb = np.delete(emb, nanIdx, axis=0)
    label = np.delete(label, nanIdx, axis=0)
    # If there is a predefined max or min, clip out-of-bounds labels to those bounds
    # (deleting the out-of-bounds points instead is a commented-out alternative in the original)
    if 'vmin' in kwargs:
        label[label < kwargs['vmin']] = kwargs['vmin']
    if 'vmax' in kwargs:
        label[label > kwargs['vmax']] = kwargs['vmax']
    # Create the bin edges (each row of binEdges holds [lower, upper) for one bin)
    if 'vmin' in kwargs and 'vmax' in kwargs:
        binSize = (kwargs['vmax'] - kwargs['vmin']) / nBins
        lowerEdges = np.linspace(kwargs['vmin'], kwargs['vmin'] + binSize*(nBins-1), nBins)
    else:
        binSize = (np.max(label) - np.min(label)) / nBins
        lowerEdges = np.linspace(np.min(label), np.min(label) + binSize*(nBins-1), nBins)
    binEdges = np.column_stack((lowerEdges, lowerEdges + binSize))
    # Create binLabel: assign each point the 1-based index of its label bin
    binLabel = np.zeros(label.shape)
    for b in range(nBins-1):
        binLabel[np.logical_and(label >= binEdges[b,0], label < binEdges[b,1])] = 1 + int(np.max(binLabel))
    # The last bin also includes its upper edge
    binLabel[np.logical_and(label >= binEdges[nBins-1,0], label <= binEdges[nBins-1,1])] = 1 + int(np.max(binLabel))
    # Clean outliers from each cluster if specified in kwargs (noise points get label 0)
    if 'filterNoise' in kwargs and kwargs['filterNoise']:
        for l in np.unique(binLabel):
            noiseIdx = filter_noisy_outliers(emb[binLabel==l,:])
            noiseIdx = np.where(binLabel==l)[0][noiseIdx]
            binLabel[noiseIdx] = 0
    # Discard clusters that are too small
    # Count points per cluster, indexing by label value so that a 0 noise label
    # does not shift the mapping between counts and labels
    uniqueVal = np.unique(binLabel)
    nPoints = np.array([np.sum(binLabel==value) for value in uniqueVal])
    # delLabels = uniqueVal[nPoints < label.size*1/100]  # alternative criterion: < 1% of points
    delLabels = uniqueVal[nPoints < 9]  # n_neighbors * 3
    # Delete outlier clusters by reassigning their points to the noise label 0
    for delVal in delLabels:
        if delVal > 0:
            binLabel[binLabel==delVal] = 0
    # Renumber the surviving bin labels consecutively from 1 to n clusters
    # (the pairwise overlap loop below assumes labels 1..n without gaps)
    newLabel = np.zeros_like(binLabel)
    for idx, val in enumerate(np.unique(binLabel)[np.unique(binLabel) > 0]):
        newLabel[binLabel==val] = idx + 1
    binLabel = newLabel
    if emb.shape[1]==2 and plotCluster:
        import URC_get_data_contour  # local import: only required when plotting
        f, ax = plt.subplots(1,1)
        for l in np.unique(binLabel)[np.unique(binLabel)>0]:
            contour = URC_get_data_contour.get_data_contour(emb[binLabel==l,:], 20)
            cent = np.mean(contour, axis=0)
            ax.plot(contour[:,0], contour[:,1])
            ax.scatter(cent[0], cent[1], s=60)
        ax.legend(['Clus%d'%x for x in np.unique(binLabel)[np.unique(binLabel)>0]])
    # Compute the cluster index from the pairwise overlap between clusters
    nClusters = int(np.sum(np.unique(binLabel) > 0))
    overlapMat = np.zeros((nClusters, nClusters))
    for ii in range(nClusters):
        for jj in range(nClusters):
            if ii != jj:
                overlap = compute_pointCloudsOverlap(emb[binLabel==ii+1], emb[binLabel==jj+1], 3, distance_metric)
                overlapMat[ii,jj] = overlap/100
    # Symmetrize overlap matrix (overlap(A,B) and overlap(B,A) generally differ)
    overlapMat = (overlapMat + overlapMat.T) / 2
    # Fraction of above-threshold overlaps per cluster, averaged and inverted:
    # 1 means no cluster pair overlaps beyond the threshold, 0 means all pairs do
    if nClusters > 1:
        clusterIndex = 1 - np.mean(np.sum(1*(overlapMat >= overlapThreshold), axis=0))/(nClusters - 1)
    else:
        clusterIndex = 1.0  # assumption: a single cluster has no overlaps, so return the maximum index
    return clusterIndex, binLabel, overlapMat
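
# A minimal end-to-end sketch, runnable as a script. The column names, bin
# count, and synthetic data below are illustrative assumptions, not values
# prescribed by the module.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    nPerClus = 100
    # Three well-separated 2D blobs, one label value per blob
    points = np.vstack([rng.normal(0, 0.5, (nPerClus, 2)) + offset
                        for offset in ([0, 0], [5, 0], [0, 5])])
    labelVals = np.repeat([0.0, 1.0, 2.0], nPerClus)
    embDf = pd.DataFrame(points, columns=['dim1', 'dim2'])
    clusterIndex, binLabel, overlapMat = computeClusterIndex_V4(
        embDf, labelVals, nBins=3, dimNames=['dim1', 'dim2'], plotCluster=False)
    print('cluster index: %.2f' % clusterIndex)  # close to 1 for separated blobs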