Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 209 additions & 6 deletions dadac/dadac.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import time
from wurlitzer import sys_pipes
from copy import deepcopy
import matplotlib.pyplot as plt


ct_float_t = ct.c_double
Expand Down Expand Up @@ -225,6 +226,7 @@ def __init__(self):
self._export_borders = self.lib.export_borders
self._export_borders.argtypes = [
ct.POINTER(clusters),
ct.POINTER(datapoint_info),
np.ctypeslib.ndpointer(np.int32),
np.ctypeslib.ndpointer(ct_float_t),
np.ctypeslib.ndpointer(ct_float_t),
Expand Down Expand Up @@ -801,25 +803,33 @@ def _get_borders(self):
if self.state["clustering"]:
self._log_den_bord = np.zeros(
(self.N_clusters, self.N_clusters), self._ftype
)
) - 1
self._log_den_bord_err = np.zeros(
(self.N_clusters, self.N_clusters), self._ftype
)
self._border_indices = (
np.zeros((self.N_clusters, self.N_clusters), np.int32) - 1
)

self._log_den_bord = np.ascontiguousarray(self._log_den_bord)
self._log_den_bord_err = np.ascontiguousarray(self._log_den_bord_err)
self._border_indices = np.ascontiguousarray(self._border_indices)

self._export_borders(
self._clusters,
self._datapoints,
self._border_indices,
self._log_den_bord,
self._log_den_bord_err,
)
# ugly thing to retrieve correct thing in dadapy as border density
# ugly thing to retrieve correct dadapy border density
# DENSITY where no border exists should be either 0 (or 1 if log)
# Densities in dadapy are a MESS

mm = np.zeros_like(self._log_den_bord)
f = np.where(self._border_indices == -1)
mm[f] = -1
mm += np.eye(self._N_clusters)
self._log_den_bord -= mm
mm = np.min(self.log_den - self.Z*self.log_den_err) - 1
mm -= np.eye(self._N_clusters)
self._log_den_bord += mm
else:
raise ValueError(
"Borders not computed yet use `Data.compute_clustering_[DP,ADP,...]()`"
Expand Down Expand Up @@ -917,3 +927,196 @@ def __del__(self):
self._free_datapoints(self._datapoints, self.n)
if not self._clusters is None:
self._free_clusters(ct.pointer(self._clusters))


def get_dendrogram(Data, cmap="viridis", savefig="", logscale=True):
"""Generate a visualisation of the topography computed with ADP.

This visualisation fundamentally corresponds to a hierarchy of the clusters built
with Single Linkage taking as similarity measure the density at the
border between clusters.
At difference from classical dendrograms, where all the branches have the same height,
in this case the height of the branches is proportional to the density of the cluster
centre.
To convey more information, the distance in the x-axis between
clusters is proportional to the population (or its logarithm).

Args:
Data: A dadapy data object for which ADP has been already run.
cmap: (optional) The color map for representing the different clusters,
the default is "viridis".
savefig: (str, optional) A string with the name of the file in which the dendrogram
will be saved. The default is empty, so no file is generated.
logscale: (bool, optional) Makes the distances in the x-axis between clusters proportional
to the logarithm of the population of the clusters instead of
proportional to the population itself. In very unbalanced clusterings,
it makes the dendrogram more human readable. The default is True.

Returns:

"""
# Prepare some auxiliary lists
e1 = []
e2 = []
d12 = []
L = []
Li1 = []
Li2 = []
Ldis = []
Fmax = max(Data.log_den)
Rho_bord_m = np.copy(Data.log_den_bord)
# Obtain populations of the clusters for fine tunning the x-axis
_, pop = np.unique(Data.cluster_assignment, return_counts=True)

if logscale:
pop = np.log(pop)
xr = np.sum(pop)
# Obtain distances in list format from topography
for i in range(Data.N_clusters - 1):
for j in range(i + 1, Data.N_clusters):
dis12 = Fmax - Rho_bord_m[i][j]
e1.append(i)
e2.append(j)
d12.append(dis12)

# Obtain the dendrogram in form of links
nlinks = 0
clnew = Data.N_clusters
for j in range(Data.N_clusters - 1):
aa = np.argmin(d12)
nlinks = nlinks + 1
L.append(clnew + nlinks)
Li1.append(e1[aa])
Li2.append(e2[aa])
Ldis.append(d12[aa])
# update distance matrix
t = 0
fe = Li1[nlinks - 1]
fs = Li2[nlinks - 1]
newname = L[nlinks - 1]
# list of untouched clusters
unt = []
for _ in d12:
if (e1[t] != fe) & (e1[t] != fs):
unt.append(e1[t])
if (e2[t] != fe) & (e2[t] != fs):
unt.append(e2[t])
t = t + 1
myset = set(unt)
unt = list(myset)
# Build a new distance matrix
e1new = []
e2new = []
d12new = []
for j in unt:
t = 0
dmin = 9.9e99
for _ in d12:
if (e1[t] == j) | (e2[t] == j):
if (e1[t] == fe) | (e2[t] == fe) | (e1[t] == fs) | (e2[t] == fs):
if d12[t] < dmin:
dmin = d12[t]
t = t + 1
e1new.append(j)
e2new.append(newname)
d12new.append(dmin)

t = 0
for _ in d12:
if (unt.count(e1[t])) & (unt.count(e2[t])):
e1new.append(e1[t])
e2new.append(e2[t])
d12new.append(d12[t])
t = t + 1

e1 = e1new
e2 = e2new
d12 = d12new

# Get the order in which the elements should be displayed
sorted_elements = []
sorted_elements.append(L[nlinks - 1])

for jj in range(len(L)):
j = len(L) - jj - 1
for i in range(len(sorted_elements)):
if sorted_elements[i] == L[j]:
sorted_elements[i] = Li2[j]
sorted_elements.insert(i, Li1[j])

add = 0.0
x = []
y = []
label = []
join_distance = []
for i in range(len(sorted_elements)):
label.append(sorted_elements[i])
j = Data.cluster_centers[label[i]]
y.append(Data.log_den[j])
x.append(add + 0.5 * pop[sorted_elements[i]])
add = add + pop[sorted_elements[i]]
join_distance.append(add)

xs = x.copy()
ys = y.copy()
labels = label.copy()
zorder = 0
for jj in range(len(L)):
c1 = label.index(Li1[jj])
c2 = label.index(Li2[jj])
label.append(L[jj])
if c1 < len(sorted_elements):
x.append(join_distance[c1])
else:
x.append((x[c1] + x[c2]) / 2.0)
ynew = Fmax - Ldis[jj]
y.append(ynew)
x1 = x[c1]
y1 = y[c1]
x2 = x[c2]
y2 = y[c2]
zorder = zorder + 1
plt.plot(
[x1, x1], [y1, ynew], color="k", linestyle="-", linewidth=2, zorder=zorder
)
zorder = zorder + 1
plt.plot(
[x2, x2], [y2, ynew], color="k", linestyle="-", linewidth=2, zorder=zorder
)
zorder = zorder + 1
plt.plot(
[x1, x2], [ynew, ynew], color="k", linestyle="-", linewidth=2, zorder=zorder
)

zorder = zorder + 1
cmal = plt.get_cmap(cmap, Data.N_clusters)
colors = cmal(np.arange(0, cmal.N))
plt.scatter(xs, ys, c=labels, s=100, zorder=zorder, cmap=cmap)
for i in range(Data.N_clusters):
zorder = zorder + 1
cc = "k"
r = colors[labels[i]][0]
g = colors[labels[i]][1]
b = colors[labels[i]][2]
luma = (0.2126 * r + 0.7152 * g + 0.0722 * b) * 255
if luma < 156:
cc = "w"
plt.annotate(
labels[i],
(xs[i], ys[i]),
horizontalalignment="center",
verticalalignment="center",
zorder=zorder,
c=cc,
weight="bold",
)
plt.xlim([-0.02 * xr, xr])
xname = "Population"
if logscale:
xname = "ln(Population)"
plt.xlim([0, xr])
plt.xlabel(xname)
plt.ylabel(r"ln($\rho$)")
if savefig != "":
plt.savefig(savefig)
plt.show()
16 changes: 9 additions & 7 deletions dadac/src/dadac.c
Original file line number Diff line number Diff line change
Expand Up @@ -3153,7 +3153,7 @@ void export_cluster_assignment(datapoint_info* points, int* labels, idx_t n)
for(idx_t i = 0; i < n; ++i) labels[i] = points[i].cluster_idx;
}

void export_borders(clusters* clusters, int* border_idx, float_t* border_den, float_t* border_err)
void export_borders(clusters* clusters, datapoint_info* points, int* border_idx, float_t* border_den, float_t* border_err)
{
idx_t nclus = clusters -> centers.count;
if(clusters->use_sparse_borders)
Expand All @@ -3163,9 +3163,10 @@ void export_borders(clusters* clusters, int* border_idx, float_t* border_den, fl
{
idx_t j = clusters -> sparse_borders[i].data[el].i;
idx_t p = i*nclus + j;
border_idx[p] = (int)j;
border_den[p] = clusters -> sparse_borders[i].data[el].density;
border_err[p] = clusters -> sparse_borders[i].data[el].error;
idx_t index = clusters -> sparse_borders[i].data[el].idx;
border_idx[p] = (int)index;
border_den[p] = points[index].log_rho_c;
border_err[p] = points[index].log_rho_err;
}
}
else
Expand All @@ -3174,9 +3175,10 @@ void export_borders(clusters* clusters, int* border_idx, float_t* border_den, fl
for(idx_t j = 0; j < nclus; ++j)
{
idx_t p = i*nclus + j;
border_idx[p] = (int)clusters -> __borders_data[p].idx;
border_den[p] = clusters -> __borders_data[p].density;
border_err[p] = clusters -> __borders_data[p].error;
idx_t index = clusters -> __borders_data[p].idx;
border_idx[p] = (int)index;
border_den[p] = points[index].log_rho_c;
border_err[p] = points[index].log_rho_err;
}
}
}
Expand Down
262 changes: 203 additions & 59 deletions demo.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
numpy
scikit-learn
wurlitzer
matlplotlib
Loading