Source code for clustering

# Copyright 2021-2023 The DADApy Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
The *clustering* module contains the *Clustering* class.

Density-based clustering algorithms are implemented as methods of this class.
"""

import multiprocessing
import time
import warnings

import numpy as np
import scipy as sp

from dadapy._cython import cython_clustering as cf
from dadapy._cython import cython_clustering_v2 as cf2
from dadapy.density_estimation import DensityEstimation

cores = multiprocessing.cpu_count()


[docs] class Clustering(DensityEstimation): """Perform clustering using various density-based clustering algorithms. Inherits from the DensityEstimation class. Attributes: N_clusters (int): Number of clusters found cluster_assignment (list(int)): A list of length N containing the cluster assignment of each point as an integer from 0 to N_clusters-1. cluster_centers (list(int)): Indices of the centroids of each cluster (density peak) cluster_indices (list(list(int))): A list of lists. Each sublist contains the indices belonging to the corresponding cluster. log_den_bord (np.ndarray(float)): A matrix of dimensions N_clusters x N_clusters containing the estimated log density of the saddle point between each couple of peaks. log_den_bord_err (np.ndarray(float)): A matrix of dimensions N_clusters x N_clusters containing the estimated error on the log density of the saddle point between each couple of peaks. bord_indices (np.ndarray(float)): A matrix of dimensions N_clusters x N_clusters containing the indices of the saddle point between each couple of peaks. """ def __init__( self, coordinates=None, distances=None, maxk=None, verbose=False, n_jobs=cores ): """Initialise the Clustering class.""" super().__init__( coordinates=coordinates, distances=distances, maxk=maxk, verbose=verbose, n_jobs=n_jobs, ) self.cluster_indices = None self.N_clusters = None self.cluster_assignment = None self.cluster_centers = None self.log_den_bord_err = None self.log_den_bord = None self.bord_indices = None self.delta = None # Minimum distance from an element with higher density self.ref = None # Index of the nearest element with higher density
[docs] def compute_clustering_ADP(self, Z=1.65, halo=False, v2=False): """Compute clustering according to the algorithm DPA. The only free parameter is the merging factor Z, which controls how the different density peaks are merged together. The higher the Z, the more aggressive the merging, the smaller the number of clusters. The calculation is optimized though cython Args: Z(float): merging parameter halo (bool): compute (or not) the halo points Returns: cluster_assignment (np.ndarray(int)): assignment of points to specific clusters References: M. d’Errico, E. Facco, A. Laio, A. Rodriguez, Automatic topography of high-dimensional data sets by non-parametric density peak clustering, Information Sciences 560 (2021) 476–492 """ if self.log_den is None: self.compute_density_PAk() assert not np.isnan(np.sum(self.log_den)), "log density contains nan values" assert not np.isnan( np.sum(self.log_den_err) ), "log error density contains nan values" if self.verb: print("Clustering started") if v2 is True: warnings.warn( """using adp implementation v2: this requires less memory but can be two times slower than the original implementation""", stacklevel=2, ) # Make all values of log_den positives (this is important to help convergence) # even when subtracting the value Z*log_den_err log_den_min = np.min(self.log_den - Z * self.log_den_err) log_den_c = self.log_den - log_den_min + 1 # Putative modes of the PDF as preliminary clusters g = log_den_c - self.log_den_err # centers are point of max density (max(g) ) within their optimal neighborhood (defined by kstar) seci = time.time() if v2: out = cf2._compute_clustering( Z, halo, self.kstar, self.dist_indices.astype(int), self.maxk, self.verb, self.log_den_err, log_den_c, g, self.N, ) else: out = cf._compute_clustering( Z, halo, self.kstar, self.dist_indices.astype(int), self.maxk, self.verb, self.log_den_err, log_den_c, g, self.N, ) secf = time.time() self.cluster_indices = out[0] self.N_clusters = out[1] self.cluster_assignment = out[2] self.cluster_centers = out[3] self.log_den_bord = out[4] + log_den_min - 1 self.log_den_bord_err = out[5] self.bord_indices = out[6] if self.verb: print(f"Clustering finished, {self.N_clusters} clusters found") print(f"total time is, {secf - seci}") return self.cluster_assignment
[docs] def compute_DecGraph(self): """Compute the decision graph.""" assert self.log_den is not None, "Compute density before" assert self.X is not None self.delta = np.zeros(self.N) self.ref = np.zeros(self.N, dtype="int") tt = np.arange(self.N) imax = [] ncalls = 0 for i in range(self.N): ll = tt[((self.log_den > self.log_den[i]) & (tt != i))] if ll.shape[0] > 0: a1, a2, a3 = np.intersect1d( self.dist_indices[i, :], ll, return_indices=True, assume_unique=True ) if a1.shape[0] > 0: aa = np.min(a2) self.delta[i] = self.distances[i, aa] self.ref[i] = self.dist_indices[i, aa] else: ncalls = ncalls + 1 dd = self.X[((self.log_den > self.log_den[i]) & (tt != i))] ds = np.transpose( sp.spatial.distance.cdist([np.transpose(self.X[i, :])], dd) ) j = np.argmin(ds) self.ref[i] = ll[j] self.delta[i] = ds[j] else: self.delta[i] = -100.0 imax.append(i) self.delta[imax] = 1.05 * np.max(self.delta) print("Number of points for which self.delta needed call to cdist=", ncalls)
[docs] def compute_clustering_DP(self, dens_cut=0.0, delta_cut=0.0, halo=False): """Compute clustering using the Density Peak algorithm. Args: dens_cut (float): cutoff on density values delta_cut (float): cutoff on distance values halo (bool): use or not halo points Returns: cluster_assignment (np.ndarray(int)): assignment of points to specific clusters References: A. Rodriguez, A. Laio, Clustering by fast search and find of density peaks, Science 344 (6191) (2014) 1492–1496. """ assert self.delta is not None ordered = np.argsort(-self.log_den) self.cluster_assignment = np.zeros(self.N, dtype="int") tt = np.arange(self.N) center_label = np.zeros(self.N, dtype="int") ncluster = -1 for i in range(self.N): j = ordered[i] if (self.log_den[j] > dens_cut) & (self.delta[j] > delta_cut): ncluster = ncluster + 1 self.cluster_assignment[j] = ncluster center_label[j] = ncluster else: self.cluster_assignment[j] = self.cluster_assignment[self.ref[j]] center_label[j] = -1 self.centers = tt[(center_label != -1)] if halo: bord = np.zeros(self.N, dtype="int") halo = np.copy(self.cluster_assignment) for i in range(self.N): for j in self.dist_indices[i, :][(self.distances[i, :] <= self.dc[i])]: if self.cluster_assignment[i] != self.cluster_assignment[j]: bord[i] = 1 halo_cutoff = np.zeros(ncluster + 1) halo_cutoff[:] = np.min(self.log_den) - 1 for j in range(ncluster + 1): td = self.log_den[((bord == 1) & (self.cluster_assignment == j))] if td.size != 0: halo_cutoff[j] = np.max(td) halo[tt[(self.log_den < halo_cutoff[self.cluster_assignment])]] = -1 self.cluster_assignment = halo return self.cluster_assignment
[docs] def compute_clustering_ADP_pure_python( # noqa: C901 self, Z=1.65, halo=False, v2=False ): """Compute ADP clustering, but without the cython optimization.""" if self.log_den is None: self.compute_density_PAk() assert not np.isnan(np.sum(self.log_den)), "log density contains nan values" assert not np.isnan( np.sum(self.log_den_err) ), "log error density contains nan values" if self.verb: print("Clustering started") if v2 is True: warnings.warn( """using adp implementation v2: this requires less memory but can be two times slower than the original implementation""", stacklevel=2, ) # Make all values of log_den positives (this is important to help convergence) # even when subtracting the value Z*log_den_err log_den_min = np.min(self.log_den - Z * self.log_den_err) # define a "log_den_c" specific to perform clustering log_den_c = self.log_den - log_den_min + 1 # Find the putative modes of the PDF as preliminary clusters sec = time.time() g = log_den_c - self.log_den_err centers, removed_centers = self._find_density_modes(g) Nclus = len(centers) if self.verb: print("Number of clusters before multimodality test=", Nclus) cluster_init, cl_struct = self._preliminary_cluster_assignment( g, centers, removed_centers ) sec2 = time.time() if self.verb: print( "{0:0.2f} seconds clustering before multimodality test".format( sec2 - sec ) ) # Find border points between putative clusters sec = time.time() if not v2: ( log_den_bord, log_den_bord_err, bord_index, ) = self._find_borders_between_clusters( Nclus, g, cl_struct, centers, cluster_init, log_den_c ) else: saddle_density, saddle_indices = self._find_borders_between_clusters_v2( Nclus, g, cl_struct, centers, cluster_init, log_den_c ) sec2 = time.time() if self.verb: print("{0:0.2f} seconds identifying the borders".format(sec2 - sec)) sec = time.time() if not v2: surviving_clusters, bord_index = self._multimodality_test( Nclus, Z, log_den_c, centers, cl_struct, log_den_bord, log_den_bord_err, bord_index, ) else: ( surviving_clusters, saddle_density, saddle_indices, ) = self._multimodality_test_v2( Nclus, Z, log_den_c, centers, cl_struct, saddle_density, saddle_indices ) sec2 = time.time() if self.verb: print("{0:0.2f} seconds with multimodality test".format(sec2 - sec)) if not v2: ( N_clusters, cluster_assignment, cluster_indices, cluster_centers, log_den_bord, log_den_bord_err, bord_indices, ) = self._finalise_clustering( Nclus, halo, surviving_clusters, cl_struct, centers, log_den_bord, log_den_bord_err, bord_index, log_den_c, ) else: ( N_clusters, cluster_assignment, cluster_indices, cluster_centers, log_den_bord, log_den_bord_err, bord_indices, ) = self._finalise_clustering_v2( Nclus, halo, surviving_clusters, cl_struct, centers, saddle_density, saddle_indices, log_den_c, ) self.cluster_indices = cluster_indices self.N_clusters = N_clusters self.cluster_assignment = cluster_assignment self.cluster_centers = cluster_centers self.log_den_bord = ( log_den_bord + log_den_min - 1 ) # remove wrong normalisation introduced earlier self.log_den_bord_err = log_den_bord_err self.bord_indices = bord_indices if self.verb: print("Clustering finished, {} clusters found".format(self.N_clusters)) return self.cluster_assignment
# ------------ helper methods for compute_clustering_ADP_pure_python ------------ # def _find_density_modes(self, g): """Find the modes of the density.""" centers = [] for i in range(self.N): t = 0 for j in range(1, self.kstar[i] + 1): if g[i] < g[self.dist_indices[i, j]]: t = 1 break # "i" is a center if it has no point at a higher density if t == 0: centers.append(i) centers_iter = centers.copy() removed_centers = [] for i_center in centers_iter: l, m = np.where(self.dist_indices == i_center) # keep only neighborhoods where i_center is within kstar mask = m <= self.kstar[l] l, m = l[mask], m[mask] # index of the point of maximum density in these neighborhoods max_rho = np.argmax(g[l]) max_rho = l[max_rho] # if the density of 'max_rho' is higher than that of i_center, # remove i_center and store [i_center, max_rho] (see later) if g[max_rho] > g[i_center]: # check if this max_rho is already in removed centers if len(removed_centers) > 0: is_valid = np.where(np.array(removed_centers)[:, 0] == max_rho)[0] # if current max_rho is in removed centers use its max_rho if is_valid.size > 0: max_rho = removed_centers[is_valid[0]][1] removed_centers.append([i_center, max_rho]) centers.remove(i_center) return centers, np.array(removed_centers) def _preliminary_cluster_assignment(self, g, centers, removed_centers): """Find a preliminary assignment of points to the closest density peak. Args: g: scaled log density of points centers: preliminary density peaks Returns: cluster_init (list(int)): preliminary assignations of points to clusters cl_struct (list(list(int))): list of points in each cluster """ # all points assigned to no clusters initially cluster_init = [-1] * self.N # assign centers to their own cluster for i in centers: cluster_init[i] = centers.index(i) # Get the rank of the elements in the g vector # sorted in decreasing order. sortg = np.argsort(-g) # Perform preliminary assignation to clusters maxk = self.dist_indices.shape[1] for j in range(self.N): ele = sortg[j] nn = 0 while cluster_init[ele] == -1 and nn < maxk - 1: nn = nn + 1 cluster_init[ele] = cluster_init[self.dist_indices[ele, nn]] # if in ther first maxk there is no point assigned to a cluster, # assign the point to the nearby center of max density if cluster_init[ele] == -1: # all the neighbors of ele (ele included!) ele_neighbors = self.dist_indices[ele] # indices (in 'removed_centers') of removed centers in the neighbor of ele (may include ele itself!) # (all_removed_centers --> indices of the centers that have been removed) all_removed_centers = removed_centers[:, 0] _, _, ind_removed_centers_ele = np.intersect1d( ele_neighbors, all_removed_centers, return_indices=True ) # indices (in 'removed_centers') of the centers of higher density # in the neighborhood of 'removed centers' # (higher_density_centers --> centers of higher density that have a # 'removed center' in their neighborhood) higher_density_centers = removed_centers[:, 1] higher_density_centers_ele = higher_density_centers[ ind_removed_centers_ele ] # index (in 'cluster_init') of the 'maximum density center' of such neighboring centers max_center = higher_density_centers_ele[ np.argmax(g[higher_density_centers_ele]) ] cluster_init[ele] = cluster_init[max_center] # useful list of points in the clusters cl_struct = [] for i in range(len(centers)): x1 = [] for j in range(self.N): if cluster_init[j] == i: x1.append(j) cl_struct.append(x1) return cluster_init, cl_struct def _find_borders_between_clusters( # noqa: C901 self, Nclus, g, cl_struct, centers, cluster_init, log_den_c ): """Find saddle points between clusters. Assume i and j are points belonging to the clusters c(i) and c(j). Then, point i is a border point between c(i) and c(j) if: a) It has in its neighborhood a point j belonging to other cluster. b) There are no other points belonging to c(i) nearer from point j c) It has the maximum density among the points that fulfill a) & b) Args: Nclus: number of clusters g: scaled log density of points cl_struct: points assigned to each cluster centers: density peaks cluster_init: cluster assignment log_den_c: log density of points Returns: log_den_bord: log density of the saddle points log_den_bord_err: error on the log density of saddle points bord_index: square matrix providing the index of the saddle point between any two peaks """ # this implementation can be vry costly if Nclus is > 10^4 if Nclus > 10000: warnings.warn( """There are > 10k initial putative clusters: the matrices of the saddle points may cause out of memory error (3x Nclus x Nclus --> > 2.4 GB required). If this is the case, call compute_clustering_ADP_pure_python(v2 = True). Ignore the warning otherwise.""", stacklevel=2, ) try: log_den_bord = np.zeros((Nclus, Nclus), dtype=np.float64) log_den_bord_err = np.zeros((Nclus, Nclus), dtype=np.float64) # set all bord indices to -1 (no points) initially bord_index = np.ones((Nclus, Nclus), dtype=int) * -1 except MemoryError: print( f"Nclus = {Nclus}. Out of memory error: call compute_clustering_ADP_pure_python(v2 = True)" ) for c in range(Nclus): for p1 in cl_struct[c]: # p1 points in a given cluster if p1 in centers: pp = -1 else: # a point p2 in the neighborhood of p1 belongs to a cluster cp different from p1 for k in range(1, self.kstar[p1] + 1): p2 = self.dist_indices[p1, k] pp = -1 if ( cluster_init[p2] != c ): # a point in the neighborhood of p1 belongs to another cluster pp = p2 cp = cluster_init[pp] # neighbor cluster index break if pp != -1: for k in range(1, self.maxk): po = self.dist_indices[pp, k] # pp here is p2 if po == p1: # p1 is the closest point of p2 belonging to c break if ( cluster_init[po] == c ): # there is a point of c closest to p2 than p1 pp = -1 break # here check if the border point is a saddle point if pp != -1: if ( g[p1] > log_den_bord[c][cp] ): # g = log_den_c - self.log_den_err different from next log_den_bord without log_den_err log_den_bord[c][cp] = g[p1] log_den_bord[cp][c] = g[p1] bord_index[cp][c] = p1 bord_index[c][cp] = p1 # fill in matrices of bord densities and errors in a symmetric way for i in range(Nclus - 1): for j in range(i + 1, Nclus): if bord_index[i][j] != -1: log_den_bord[i][j] = log_den_c[bord_index[i][j]] log_den_bord[j][i] = log_den_c[bord_index[j][i]] log_den_bord_err[i][j] = self.log_den_err[bord_index[i][j]] log_den_bord_err[j][i] = self.log_den_err[bord_index[j][i]] # set diagonal to -1 for i in range(Nclus): log_den_bord[i][i] = -1.0 log_den_bord_err[i][i] = 0.0 return log_den_bord, log_den_bord_err, bord_index def _multimodality_test( self, Nclus, Z, log_den_c, centers, cl_struct, log_den_bord, log_den_bord_err, bord_index, ): """Merge couples of peaks if these are not statistically significant.""" check = 1 # all clusters are initialised as "surviving" surviving_clusters = [1] * Nclus while check == 1: # density and position of borders to be merged pos = [] ipos = [] jpos = [] check = 0 # check whether there are statistically not-significant couples of peaks for i in range(Nclus - 1): for j in range(i + 1, Nclus): # differences in peaks i->j and j->i a1 = log_den_c[centers[i]] - log_den_bord[i][j] a2 = log_den_c[centers[j]] - log_den_bord[i][j] # statistical thresholds e1 = Z * (self.log_den_err[centers[i]] + log_den_bord_err[i][j]) e2 = Z * (self.log_den_err[centers[j]] + log_den_bord_err[i][j]) # if two peaks are not statistically significant, save their border if a1 < e1 or a2 < e2: check = 1 pos.append(log_den_bord[i][j]) ipos.append(i) jpos.append(j) # merging process if check == 1: # start merging from border of higher density barrier_index = pos.index(max(pos)) imod = ipos[barrier_index] jmod = jpos[barrier_index] # normalised log density of first and second peak to be merged c1 = (log_den_c[centers[imod]] - log_den_bord[imod][jmod]) / ( self.log_den_err[centers[imod]] + log_den_bord_err[imod][jmod] ) c2 = (log_den_c[centers[jmod]] - log_den_bord[imod][jmod]) / ( self.log_den_err[centers[jmod]] + log_den_bord_err[imod][jmod] ) # by default, the second peak is removed (c2) if however c1 < c2 the indices are # exchanged and the first cluster is removed instead if c1 < c2: tmp = jmod jmod = imod imod = tmp # cluster jmod is removed surviving_clusters[jmod] = 0 # log_den_bord between the removed peaks is set to -1 and error to 0 log_den_bord[imod][jmod] = -1.0 log_den_bord[jmod][imod] = -1.0 log_den_bord_err[imod][jmod] = 0.0 log_den_bord_err[jmod][imod] = 0.0 # the points of the jmod peak are added to the imod peak cl_struct[imod].extend(cl_struct[jmod]) cl_struct[jmod] = [] # recompute the borders with the other peaks for i in range(Nclus): if i != imod and i != jmod: if log_den_bord[imod][i] < log_den_bord[jmod][i]: bord_index[imod][i] = bord_index[jmod][i] bord_index[i][imod] = bord_index[imod][i] log_den_bord[imod][i] = log_den_bord[jmod][i] log_den_bord[i][imod] = log_den_bord[imod][i] log_den_bord_err[imod][i] = log_den_bord_err[jmod][i] log_den_bord_err[i][imod] = log_den_bord_err[imod][i] log_den_bord[jmod][i] = -1 log_den_bord[i][jmod] = log_den_bord[jmod][i] log_den_bord_err[jmod][i] = 0 log_den_bord_err[i][jmod] = log_den_bord_err[jmod][i] return surviving_clusters, bord_index def _finalise_clustering( # noqa: C901 self, Nclus, halo, surviving_clusters, cl_struct, centers, log_den_bord, log_den_bord_err, bord_index, log_den_c, ): """Finalise clustering.""" # compute final N_clusters, cluster_indices and cluster_centers N_clusters = 0 cluster_indices = [] cluster_centers = [] nnum = [] for j in range(Nclus): nnum.append(-1) if surviving_clusters[j] == 1: nnum[j] = N_clusters N_clusters += 1 cluster_indices.append(cl_struct[j]) cluster_centers.append(centers[j]) # initialise the final arrays bord_indices_m = np.zeros((N_clusters, N_clusters), dtype=int) log_den_bord_m = np.zeros((N_clusters, N_clusters), dtype=float) log_den_bord_err_m = np.zeros((N_clusters, N_clusters), dtype=float) for j in range(Nclus): if surviving_clusters[j] == 1: jj = nnum[j] for k in range(Nclus): if surviving_clusters[k] == 1: kk = nnum[k] log_den_bord_m[jj][kk] = log_den_bord[j][k] log_den_bord_err_m[jj][kk] = log_den_bord_err[j][k] bord_indices_m[jj][kk] = bord_index[j][k] Last_cls = np.empty(self.N, dtype=int) for j in range(N_clusters): for k in cluster_indices[j]: Last_cls[k] = j Last_cls_halo = np.copy(Last_cls) nh = 0 for j in range(N_clusters): log_den_halo = max(log_den_bord_m[j]) for k in cluster_indices[j]: if log_den_c[k] < log_den_halo: nh = nh + 1 Last_cls_halo[k] = -1 if halo: cluster_assignment = Last_cls_halo else: cluster_assignment = Last_cls log_den_bord_m = np.copy(log_den_bord_m) return ( N_clusters, cluster_assignment, cluster_indices, cluster_centers, log_den_bord_m, log_den_bord_err_m, bord_indices_m, ) def _find_borders_between_clusters_v2( # noqa: C901 self, Nclus, g, cl_struct, centers, cluster_init, log_den_c ): """Find saddle points between clusters. Assume i and j are points belonging to the clusters c(i) and c(j). Then, point i is a border point between c(i) and c(j) if: a) It has in its neighborhood a point j belonging to other cluster. b) There are no other points belonging to c(i) nearer from point j c) It has the maximum density among the points that fulfill a) & b) Args: Nclus: number of clusters int g: scaled log density of points np.array(N) cl_struct: points assigned to each cluster list(list()) centers: data index of density peaks np.array(Nclus) cluster_init: cluster assignment np.array(N) log_den_c: log density of points np.array(N) Returns: log_den_bord: log density of the saddle points log_den_bord_err: error on the log density of saddle points bord_index: square matrix providing the index of the saddle point between any two peaks """ saddle_count = 0 for c in range(Nclus): # saddle point index, saddle point density, saddle point error, cluster1 index, cluster2 index, valid_saddle # create another array of indices saddle_density_tmp = np.zeros( (Nclus, 3), dtype=float ) # density, density_error, normalized_density saddle_indices_tmp = -np.ones( (Nclus, 4), dtype=int ) # saddle point, cluster1, cluster2, is_valid saddle if c == 0: saddle_density = saddle_density_tmp saddle_indices = saddle_indices_tmp # array of saddle points of cluster c (will be eventually much less than Nclus) else: saddle_density = np.concatenate( (saddle_density, saddle_density_tmp), axis=0 ) saddle_indices = np.concatenate( (saddle_indices, saddle_indices_tmp), axis=0 ) for p1 in cl_struct[c]: # p1 point in a given cluster if p1 in centers: pp = -1 else: # a point p2 in the neighborhood of p1 belongs to a cluster cp different from p1 for k in range(1, self.kstar[p1] + 1): p2 = self.dist_indices[p1, k] pp = -1 if ( cluster_init[p2] != c ): # a point in the neighborhood of p1 belongs to another cluster pp = p2 cp = cluster_init[pp] # neighbor cluster index break if pp != -1: # p1 is the closest point in c to p2 for k in range(1, self.maxk): # why maxk? po = self.dist_indices[pp, k] # pp here is p2 if po == p1: # p1 is the closest point of p2 belonging to c break if ( cluster_init[po] == c ): # p1 is NOT the closest point of p2 belonging to c pp = -1 break # here check if the border point is a saddle point if pp != -1: # maybe contrunct a matrix of border points [p, g, c1, c2] # in log_den_bord_tmp the cluster are coupled with indices that go from smaller to bigger if c > cp: c1 = cp c2 = c else: c1 = c c2 = cp flag = 0 for i in range(saddle_count): if c1 == saddle_indices[i, 1] and c2 == saddle_indices[i, 2]: flag = 1 # there is already a border point between c and cp if g[p1] > saddle_density[i, 2]: saddle_indices[i, 0] = p1 # useful at the end saddle_density[i] = np.array( [log_den_c[p1], self.log_den_err[p1], g[p1]] ) break if flag == 0: saddle_indices[saddle_count] = np.array( [p1, c1, c2, 1], dtype=int ) saddle_density[saddle_count] = np.array( [log_den_c[p1], self.log_den_err[p1], g[p1]] ) saddle_count += 1 saddle_density = saddle_density[:saddle_count] saddle_indices = saddle_indices[:saddle_count] sort_indices = saddle_density[:, 0].argsort()[::-1] saddle_density = saddle_density[sort_indices] saddle_indices = saddle_indices[sort_indices] return saddle_density[:, :2], saddle_indices def _multimodality_test_v2( self, Nclus, Z, log_den_c, centers, cl_struct, saddle_density, saddle_indices ): """Merge couples of peaks if these are not statistically significant. Args: Nclus: number of clusters int Z: scaled log density of points int log_den_c: log density of points np.array(N) centers: data index of density peaks np.array(Nclus) cl_struct: points assigned to each cluster list(list()) log_den_bord, np.array((Nclus, 6)) """ check = 1 # all clusters are initialised as "surviving" surviving_clusters = np.ones(Nclus, dtype=int) nsaddles = len(saddle_density) while check == 1: check = 0 current_saddle = -1.0 for i in range(nsaddles): # log_den_bord already sorted if saddle_indices[i, 3] == 1: clus1, clus2 = ( saddle_indices[i, 1], saddle_indices[i, 2], ) # cluster indices center1, center2 = ( centers[clus1], centers[clus2], ) # data indices peak of clus1, clus2 a1 = ( log_den_c[center1] - saddle_density[i, 0] ) # log_den_c is an array of the density of all the points a2 = log_den_c[center2] - saddle_density[i, 0] sum_err1 = self.log_den_err[center1] + saddle_density[i, 1] sum_err2 = self.log_den_err[center2] + saddle_density[i, 1] # statistical thresholds e1 = Z * sum_err1 e2 = Z * sum_err2 if a1 < e1 or a2 < e2: check = 1 if current_saddle < saddle_density[i, 0]: max_a1, max_a2 = a1, a2 max_sum_err1, max_sum_err2 = sum_err1, sum_err2 to_remove = i current_saddle = saddle_density[i, 0] if check == 1: saddle_indices[ to_remove, -1 ] = 0 # the couple center1, center2 is removed margin1 = max_a1 / max_sum_err1 margin2 = max_a2 / max_sum_err2 # only the peak with the highest margin is kept: # by convention we set this peak to be center1 if margin1 < margin2: c1, c2 = saddle_indices[to_remove, 2], saddle_indices[to_remove, 1] else: c1, c2 = saddle_indices[to_remove, 1], saddle_indices[to_remove, 2] surviving_clusters[c2] = 0 # the second peak is removed cl_struct[c1].extend(cl_struct[c2]) cl_struct[c2] = [] # select the clusters that are neighbors to cluster1 and cluster2 # cluster label, saddle label, position of c1/c2 in saddle_indices[:, 1:3] saddle_indices = self._find_neighbor_clusters_v2( saddle_density, saddle_indices, to_remove, nsaddles, c1, c2 ) return surviving_clusters, saddle_density, saddle_indices def _finalise_clustering_v2( # noqa: C901 self, Nclus, halo, surviving_clusters, cl_struct, centers, saddle_density, saddle_indices, log_den_c, ): """Finalise clustering.""" N_clusters = 0 mapping = -np.ones(Nclus, dtype=int) # all original centers cluster_indices, cluster_centers = [], [] for i in range(Nclus): if surviving_clusters[i] == 1: mapping[i] = N_clusters # center is surviving N_clusters += 1 # !!!!!! check the index i is correct? cluster_indices.append(cl_struct[i]) cluster_centers.append(centers[i]) bord_indices_m = -np.ones((N_clusters, N_clusters), dtype=int) log_den_bord_m = np.zeros((N_clusters, N_clusters), dtype=float) log_den_bord_err_m = np.zeros((N_clusters, N_clusters), dtype=float) for i in range(len(saddle_density)): # all original saddles if saddle_indices[i, 3] == 1: # a valid saddle is between two valid centers j, k = ( mapping[saddle_indices[i, 1]], mapping[saddle_indices[i, 2]], ) # map the two valid centers in the corresponding valid new cluster labels log_den_bord_m[j, k] = saddle_density[i, 0] log_den_bord_m[k, j] = log_den_bord_m[j, k] log_den_bord_err_m[j, k] = saddle_density[i, 1] log_den_bord_err_m[k, j] = log_den_bord_err_m[j, k] bord_indices_m[j, k] = saddle_indices[i, 0] bord_indices_m[k, j] = bord_indices_m[j, k] log_den_bord_m[np.diag_indices(N_clusters)] = -1 Last_cls = np.empty(self.N, dtype=int) for j in range(N_clusters): for k in cluster_indices[j]: Last_cls[k] = j Last_cls_halo = np.copy(Last_cls) nh = 0 for j in range(N_clusters): log_den_halo = max(log_den_bord_m[j]) for k in cluster_indices[j]: if log_den_c[k] < log_den_halo: nh = nh + 1 Last_cls_halo[k] = -1 if halo: cluster_assignment = Last_cls_halo else: cluster_assignment = Last_cls log_den_bord_m = np.copy(log_den_bord_m) return ( N_clusters, cluster_assignment, cluster_indices, cluster_centers, log_den_bord_m, log_den_bord_err_m, bord_indices_m, ) def _find_neighbor_clusters_v2( # noqa: C901 self, saddle_density, saddle_indices, to_remove, nsaddles, c1, c2, ): neighbors1 = np.zeros((len(saddle_density), 3), dtype=int) neighbors2 = np.zeros((len(saddle_density), 3), dtype=int) len_neigh1, len_neigh2 = 0, 0 for i in range(nsaddles): if i != to_remove and saddle_indices[i, 3] == 1: cluster_couple = saddle_indices[i, 1:3] if c1 in cluster_couple: neighbors1[len_neigh1] = np.array([cluster_couple[0], i, 2]) if cluster_couple[0] == c1: neighbors1[len_neigh1] = np.array([cluster_couple[1], i, 1]) len_neigh1 += 1 elif c2 in cluster_couple: neighbors2[len_neigh2] = np.array([cluster_couple[0], i, 2]) if cluster_couple[0] == c2: neighbors2[len_neigh2] = np.array([cluster_couple[1], i, 1]) len_neigh2 += 1 neighbors1, neighbors2 = neighbors1[:len_neigh1], neighbors2[:len_neigh2] # check which are the common neighbors between cluster1 and cluster2 neighbors1 = neighbors1[neighbors1[:, 0].argsort()] neighbors2 = neighbors2[neighbors2[:, 0].argsort()] i, j = 0, 0 while i < len(neighbors1) and j < len(neighbors2): if neighbors1[i, 0] == neighbors2[j, 0]: # there is a common element index1, index2 = ( neighbors1[i, 1], neighbors2[j, 1], ) # index common neighbor w.r.t cluster1 if saddle_density[index1, 0] < saddle_density[index2, 0]: # if the density of the saddle is higher between cluster2 and common neighbor, # use this as new saddle between cluster1 and common neighbor saddle_density[index1, 0] = saddle_density[ index2, 0 ] # saddle density saddle_density[index1, 1] = saddle_density[ index2, 1 ] # saddle error # the couples of common elements with the c2 are "deleted" saddle_indices[index2, -1] = 0 i += 1 j += 1 elif neighbors1[i, 0] < neighbors2[j, 0]: i += 1 else: # neighbors of c2 which were not neighbors of c1 become neighbors of c1 index2, c2_index = neighbors2[j, 1], neighbors2[j, 2] saddle_indices[index2, c2_index] = c1 j += 1 # some js in the comparison may not have been taken into account while j < len(neighbors2): index2, c2_index = neighbors2[j, 1], neighbors2[j, 2] saddle_indices[index2, c2_index] = c1 j += 1 return saddle_indices