Source code for clustering

# Copyright 2021-2023 The DADApy Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
The *clustering* module contains the *Clustering* class.

Density-based clustering algorithms are implemented as methods of this class.
"""

import multiprocessing
import time
import warnings

import numpy as np
import scipy as sp

from dadapy._cython import cython_clustering as cf
from dadapy._cython import cython_clustering_v2 as cf2
from dadapy.density_estimation import DensityEstimation

cores = multiprocessing.cpu_count()



[docs]
class Clustering(DensityEstimation):
    """Perform clustering using various density-based clustering algorithms.

    Inherits from the DensityEstimation class.

    Attributes:
        N_clusters (int): Number of clusters found
        cluster_assignment (list(int)): A list of length N containing the cluster assignment of each point as an
            integer from 0 to N_clusters-1.
        cluster_centers (list(int)): Indices of the centroids of each cluster (density peak)
        cluster_indices (list(list(int))): A list of lists. Each sublist contains the indices belonging to the
            corresponding cluster.
        log_den_bord (np.ndarray(float)): A matrix of dimensions N_clusters x N_clusters containing
            the estimated log density of the saddle point between each couple of peaks.
        log_den_bord_err (np.ndarray(float)): A matrix of dimensions N_clusters x N_clusters containing
            the estimated error on the log density of the saddle point between each couple of peaks.
        bord_indices (np.ndarray(float)): A matrix of dimensions N_clusters x N_clusters containing the indices of
            the saddle point between each couple of peaks.

    """

    def __init__(
        self, coordinates=None, distances=None, maxk=None, verbose=False, n_jobs=cores
    ):
        """Initialise the Clustering class."""
        super().__init__(
            coordinates=coordinates,
            distances=distances,
            maxk=maxk,
            verbose=verbose,
            n_jobs=n_jobs,
        )

        self.cluster_indices = None
        self.N_clusters = None
        self.cluster_assignment = None
        self.cluster_centers = None
        self.log_den_bord_err = None
        self.log_den_bord = None
        self.bord_indices = None

        self.delta = None  # Minimum distance from an element with higher density
        self.ref = None  # Index of the nearest element with higher density


[docs]
    def compute_clustering_ADP(self, Z=1.65, halo=False, v2=False):
        """Compute clustering according to the algorithm DPA.

        The only free parameter is the merging factor Z, which controls how the different density peaks are merged
        together. The higher the Z, the more aggressive the merging, the smaller the number of clusters.
        The calculation is optimized though cython

        Args:
            Z(float): merging parameter
            halo (bool): compute (or not) the halo points

        Returns:
            cluster_assignment (np.ndarray(int)): assignment of points to specific clusters

        References:
            M. d’Errico, E. Facco, A. Laio, A. Rodriguez, Automatic topography  of  high-dimensional  data  sets  by
                non-parametric  density peak clustering, Information Sciences 560 (2021) 476–492

        """
        if self.log_den is None:
            self.compute_density_PAk()

        assert not np.isnan(np.sum(self.log_den)), "log density contains nan values"
        assert not np.isnan(
            np.sum(self.log_den_err)
        ), "log error density contains nan values"

        if self.verb:
            print("Clustering started")

        if v2 is True:
            warnings.warn(
                """using adp implementation v2: this requires less memory but can
                be two times slower than the original implementation""",
                stacklevel=2,
            )

        # Make all values of log_den positives (this is important to help convergence)
        # even when subtracting the value Z*log_den_err
        log_den_min = np.min(self.log_den - Z * self.log_den_err)
        log_den_c = self.log_den - log_den_min + 1

        # Putative modes of the PDF as preliminary clusters
        g = log_den_c - self.log_den_err

        # centers are point of max density  (max(g) ) within their optimal neighborhood (defined by kstar)
        seci = time.time()

        if v2:
            out = cf2._compute_clustering(
                Z,
                halo,
                self.kstar,
                self.dist_indices.astype(int),
                self.maxk,
                self.verb,
                self.log_den_err,
                log_den_c,
                g,
                self.N,
            )
        else:
            out = cf._compute_clustering(
                Z,
                halo,
                self.kstar,
                self.dist_indices.astype(int),
                self.maxk,
                self.verb,
                self.log_den_err,
                log_den_c,
                g,
                self.N,
            )

        secf = time.time()

        self.cluster_indices = out[0]
        self.N_clusters = out[1]
        self.cluster_assignment = out[2]
        self.cluster_centers = out[3]
        self.log_den_bord = out[4] + log_den_min - 1
        self.log_den_bord_err = out[5]
        self.bord_indices = out[6]

        if self.verb:
            print(f"Clustering finished, {self.N_clusters} clusters found")
            print(f"total time is, {secf - seci}")

        return self.cluster_assignment



[docs]
    def compute_DecGraph(self):
        """Compute the decision graph."""
        assert self.log_den is not None, "Compute density before"
        assert self.X is not None
        self.delta = np.zeros(self.N)
        self.ref = np.zeros(self.N, dtype="int")
        tt = np.arange(self.N)
        imax = []
        ncalls = 0
        for i in range(self.N):
            ll = tt[((self.log_den > self.log_den[i]) & (tt != i))]
            if ll.shape[0] > 0:
                a1, a2, a3 = np.intersect1d(
                    self.dist_indices[i, :], ll, return_indices=True, assume_unique=True
                )
                if a1.shape[0] > 0:
                    aa = np.min(a2)
                    self.delta[i] = self.distances[i, aa]
                    self.ref[i] = self.dist_indices[i, aa]
                else:
                    ncalls = ncalls + 1
                    dd = self.X[((self.log_den > self.log_den[i]) & (tt != i))]
                    ds = np.transpose(
                        sp.spatial.distance.cdist([np.transpose(self.X[i, :])], dd)
                    )
                    j = np.argmin(ds)
                    self.ref[i] = ll[j]
                    self.delta[i] = ds[j]
            else:
                self.delta[i] = -100.0
                imax.append(i)
        self.delta[imax] = 1.05 * np.max(self.delta)
        print("Number of points for which self.delta needed call to cdist=", ncalls)



[docs]
    def compute_clustering_DP(self, dens_cut=0.0, delta_cut=0.0, halo=False):
        """Compute clustering using the Density Peak algorithm.

        Args:
            dens_cut (float): cutoff on density values
            delta_cut (float): cutoff on distance values
            halo (bool): use or not halo points

        Returns:
            cluster_assignment (np.ndarray(int)): assignment of points to specific clusters

        References:
            A. Rodriguez, A. Laio, Clustering by fast search and find of density peaks,
            Science 344 (6191) (2014) 1492–1496.
        """
        assert self.delta is not None
        ordered = np.argsort(-self.log_den)
        self.cluster_assignment = np.zeros(self.N, dtype="int")
        tt = np.arange(self.N)
        center_label = np.zeros(self.N, dtype="int")
        ncluster = -1
        for i in range(self.N):
            j = ordered[i]
            if (self.log_den[j] > dens_cut) & (self.delta[j] > delta_cut):
                ncluster = ncluster + 1
                self.cluster_assignment[j] = ncluster
                center_label[j] = ncluster
            else:
                self.cluster_assignment[j] = self.cluster_assignment[self.ref[j]]
                center_label[j] = -1
        self.centers = tt[(center_label != -1)]
        if halo:
            bord = np.zeros(self.N, dtype="int")
            halo = np.copy(self.cluster_assignment)

            for i in range(self.N):
                for j in self.dist_indices[i, :][(self.distances[i, :] <= self.dc[i])]:
                    if self.cluster_assignment[i] != self.cluster_assignment[j]:
                        bord[i] = 1
            halo_cutoff = np.zeros(ncluster + 1)
            halo_cutoff[:] = np.min(self.log_den) - 1
            for j in range(ncluster + 1):
                td = self.log_den[((bord == 1) & (self.cluster_assignment == j))]
                if td.size != 0:
                    halo_cutoff[j] = np.max(td)
            halo[tt[(self.log_den < halo_cutoff[self.cluster_assignment])]] = -1
            self.cluster_assignment = halo

        return self.cluster_assignment



[docs]
    def compute_clustering_ADP_pure_python(  # noqa: C901
        self, Z=1.65, halo=False, v2=False
    ):
        """Compute ADP clustering, but without the cython optimization."""
        if self.log_den is None:
            self.compute_density_PAk()

        assert not np.isnan(np.sum(self.log_den)), "log density contains nan values"
        assert not np.isnan(
            np.sum(self.log_den_err)
        ), "log error density contains nan values"

        if self.verb:
            print("Clustering started")

        if v2 is True:
            warnings.warn(
                """using adp implementation v2: this requires less memory but
                can be two times slower than the original implementation""",
                stacklevel=2,
            )

        # Make all values of log_den positives (this is important to help convergence)
        # even when subtracting the value Z*log_den_err
        log_den_min = np.min(self.log_den - Z * self.log_den_err)
        # define a "log_den_c" specific to perform clustering
        log_den_c = self.log_den - log_den_min + 1

        # Find the putative modes of the PDF as preliminary clusters
        sec = time.time()

        g = log_den_c - self.log_den_err

        centers, removed_centers = self._find_density_modes(g)

        Nclus = len(centers)

        if self.verb:
            print("Number of clusters before multimodality test=", Nclus)

        cluster_init, cl_struct = self._preliminary_cluster_assignment(
            g, centers, removed_centers
        )

        sec2 = time.time()
        if self.verb:
            print(
                "{0:0.2f} seconds clustering before multimodality test".format(
                    sec2 - sec
                )
            )

        # Find border points between putative clusters
        sec = time.time()
        if not v2:
            (
                log_den_bord,
                log_den_bord_err,
                bord_index,
            ) = self._find_borders_between_clusters(
                Nclus, g, cl_struct, centers, cluster_init, log_den_c
            )
        else:
            saddle_density, saddle_indices = self._find_borders_between_clusters_v2(
                Nclus, g, cl_struct, centers, cluster_init, log_den_c
            )

        sec2 = time.time()
        if self.verb:
            print("{0:0.2f} seconds identifying the borders".format(sec2 - sec))

        sec = time.time()
        if not v2:
            surviving_clusters, bord_index = self._multimodality_test(
                Nclus,
                Z,
                log_den_c,
                centers,
                cl_struct,
                log_den_bord,
                log_den_bord_err,
                bord_index,
            )
        else:
            (
                surviving_clusters,
                saddle_density,
                saddle_indices,
            ) = self._multimodality_test_v2(
                Nclus, Z, log_den_c, centers, cl_struct, saddle_density, saddle_indices
            )

        sec2 = time.time()
        if self.verb:
            print("{0:0.2f} seconds with multimodality test".format(sec2 - sec))

        if not v2:
            (
                N_clusters,
                cluster_assignment,
                cluster_indices,
                cluster_centers,
                log_den_bord,
                log_den_bord_err,
                bord_indices,
            ) = self._finalise_clustering(
                Nclus,
                halo,
                surviving_clusters,
                cl_struct,
                centers,
                log_den_bord,
                log_den_bord_err,
                bord_index,
                log_den_c,
            )

        else:
            (
                N_clusters,
                cluster_assignment,
                cluster_indices,
                cluster_centers,
                log_den_bord,
                log_den_bord_err,
                bord_indices,
            ) = self._finalise_clustering_v2(
                Nclus,
                halo,
                surviving_clusters,
                cl_struct,
                centers,
                saddle_density,
                saddle_indices,
                log_den_c,
            )

        self.cluster_indices = cluster_indices
        self.N_clusters = N_clusters
        self.cluster_assignment = cluster_assignment
        self.cluster_centers = cluster_centers
        self.log_den_bord = (
            log_den_bord + log_den_min - 1
        )  # remove wrong normalisation introduced earlier
        self.log_den_bord_err = log_den_bord_err
        self.bord_indices = bord_indices

        if self.verb:
            print("Clustering finished, {} clusters found".format(self.N_clusters))

        return self.cluster_assignment


    # ------------ helper methods for compute_clustering_ADP_pure_python ------------ #

    def _find_density_modes(self, g):
        """Find the modes of the density."""
        centers = []
        for i in range(self.N):
            t = 0
            for j in range(1, self.kstar[i] + 1):
                if g[i] < g[self.dist_indices[i, j]]:
                    t = 1
                    break

            # "i" is a center if it has no point at a higher density
            if t == 0:
                centers.append(i)

        centers_iter = centers.copy()

        removed_centers = []
        for i_center in centers_iter:
            l, m = np.where(self.dist_indices == i_center)

            # keep only neighborhoods where i_center is within kstar
            mask = m <= self.kstar[l]
            l, m = l[mask], m[mask]
            # index of the point of maximum density in these neighborhoods
            max_rho = np.argmax(g[l])
            max_rho = l[max_rho]

            # if the density of 'max_rho' is higher than that of i_center,
            # remove i_center and store [i_center,  max_rho] (see later)
            if g[max_rho] > g[i_center]:
                # check if this max_rho is already in removed centers
                if len(removed_centers) > 0:
                    is_valid = np.where(np.array(removed_centers)[:, 0] == max_rho)[0]
                    # if current max_rho is in removed centers use its max_rho
                    if is_valid.size > 0:
                        max_rho = removed_centers[is_valid[0]][1]

                removed_centers.append([i_center, max_rho])
                centers.remove(i_center)

        return centers, np.array(removed_centers)

    def _preliminary_cluster_assignment(self, g, centers, removed_centers):
        """Find a preliminary assignment of points to the closest density peak.

        Args:
            g: scaled log density of points
            centers: preliminary density peaks

        Returns:
            cluster_init (list(int)): preliminary assignations of points to clusters
            cl_struct (list(list(int))): list of points in each cluster
        """
        # all points assigned to no clusters initially
        cluster_init = [-1] * self.N

        # assign centers to their own cluster
        for i in centers:
            cluster_init[i] = centers.index(i)

        # Get the rank of the elements in the g vector
        # sorted in decreasing order.
        sortg = np.argsort(-g)

        # Perform preliminary assignation to clusters
        maxk = self.dist_indices.shape[1]
        for j in range(self.N):
            ele = sortg[j]
            nn = 0
            while cluster_init[ele] == -1 and nn < maxk - 1:
                nn = nn + 1
                cluster_init[ele] = cluster_init[self.dist_indices[ele, nn]]

            # if in ther first maxk there is no point assigned to a cluster,
            # assign the point to the nearby center of max density
            if cluster_init[ele] == -1:
                # all the neighbors of ele (ele included!)
                ele_neighbors = self.dist_indices[ele]

                # indices (in 'removed_centers') of removed centers in the neighbor of ele (may include ele itself!)
                # (all_removed_centers --> indices of the centers that have been removed)
                all_removed_centers = removed_centers[:, 0]

                _, _, ind_removed_centers_ele = np.intersect1d(
                    ele_neighbors, all_removed_centers, return_indices=True
                )

                # indices (in 'removed_centers') of the centers of higher density
                # in the neighborhood of 'removed centers'
                # (higher_density_centers --> centers of higher density that have a
                # 'removed center' in their neighborhood)
                higher_density_centers = removed_centers[:, 1]
                higher_density_centers_ele = higher_density_centers[
                    ind_removed_centers_ele
                ]

                # index (in 'cluster_init') of the 'maximum density center' of such neighboring centers
                max_center = higher_density_centers_ele[
                    np.argmax(g[higher_density_centers_ele])
                ]

                cluster_init[ele] = cluster_init[max_center]

        # useful list of points in the clusters
        cl_struct = []
        for i in range(len(centers)):
            x1 = []
            for j in range(self.N):
                if cluster_init[j] == i:
                    x1.append(j)
            cl_struct.append(x1)

        return cluster_init, cl_struct

    def _find_borders_between_clusters(  # noqa: C901
        self, Nclus, g, cl_struct, centers, cluster_init, log_den_c
    ):
        """Find saddle points between clusters.

        Assume i and j are points belonging to the clusters c(i) and c(j).
        Then, point i is a border point between c(i) and c(j) if:
            a) It has in its neighborhood a point j belonging to other cluster.
            b) There are no other points belonging to c(i) nearer from point j
            c) It has the maximum density among the points that fulfill a) & b)

        Args:
            Nclus: number of clusters
            g: scaled log density of points
            cl_struct: points assigned to each cluster
            centers: density peaks
            cluster_init: cluster assignment
            log_den_c: log density of points

        Returns:
            log_den_bord: log density of the saddle points
            log_den_bord_err: error on the log density of saddle points
            bord_index: square matrix providing the index of the saddle point between any two peaks
        """
        # this implementation can be vry costly if Nclus is > 10^4
        if Nclus > 10000:
            warnings.warn(
                """There are > 10k initial putative clusters:
            the matrices of the saddle points may cause out of memory error (3x Nclus x Nclus --> > 2.4 GB required).
            If this is the case, call compute_clustering_ADP_pure_python(v2 = True). Ignore the warning otherwise.""",
                stacklevel=2,
            )

        try:
            log_den_bord = np.zeros((Nclus, Nclus), dtype=np.float64)
            log_den_bord_err = np.zeros((Nclus, Nclus), dtype=np.float64)
            # set all bord indices to -1 (no points) initially
            bord_index = np.ones((Nclus, Nclus), dtype=int) * -1
        except MemoryError:
            print(
                f"Nclus = {Nclus}. Out of memory error: call compute_clustering_ADP_pure_python(v2 = True)"
            )

        for c in range(Nclus):
            for p1 in cl_struct[c]:  # p1 points in a given cluster
                if p1 in centers:
                    pp = -1

                else:
                    # a point p2 in the neighborhood of p1 belongs to a cluster cp different from p1
                    for k in range(1, self.kstar[p1] + 1):
                        p2 = self.dist_indices[p1, k]
                        pp = -1
                        if (
                            cluster_init[p2] != c
                        ):  # a point in the neighborhood of p1 belongs to another cluster
                            pp = p2
                            cp = cluster_init[pp]  # neighbor cluster index
                            break
                if pp != -1:
                    for k in range(1, self.maxk):
                        po = self.dist_indices[pp, k]  # pp here is p2
                        if po == p1:  # p1 is the closest point of p2 belonging to c
                            break
                        if (
                            cluster_init[po] == c
                        ):  # there is a point of c closest to p2 than p1
                            pp = -1
                            break

                # here check if the border point is a saddle point
                if pp != -1:
                    if (
                        g[p1] > log_den_bord[c][cp]
                    ):  # g = log_den_c - self.log_den_err different from next log_den_bord without log_den_err
                        log_den_bord[c][cp] = g[p1]
                        log_den_bord[cp][c] = g[p1]
                        bord_index[cp][c] = p1
                        bord_index[c][cp] = p1

        # fill in matrices of bord densities and errors in a symmetric way
        for i in range(Nclus - 1):
            for j in range(i + 1, Nclus):
                if bord_index[i][j] != -1:
                    log_den_bord[i][j] = log_den_c[bord_index[i][j]]
                    log_den_bord[j][i] = log_den_c[bord_index[j][i]]
                    log_den_bord_err[i][j] = self.log_den_err[bord_index[i][j]]
                    log_den_bord_err[j][i] = self.log_den_err[bord_index[j][i]]

        # set diagonal to -1
        for i in range(Nclus):
            log_den_bord[i][i] = -1.0
            log_den_bord_err[i][i] = 0.0

        return log_den_bord, log_den_bord_err, bord_index

    def _multimodality_test(
        self,
        Nclus,
        Z,
        log_den_c,
        centers,
        cl_struct,
        log_den_bord,
        log_den_bord_err,
        bord_index,
    ):
        """Merge couples of peaks if these are not statistically significant."""
        check = 1

        # all clusters are initialised as "surviving"
        surviving_clusters = [1] * Nclus

        while check == 1:
            # density and position of borders to be merged
            pos = []
            ipos = []
            jpos = []

            check = 0

            # check whether there are statistically not-significant couples of peaks
            for i in range(Nclus - 1):
                for j in range(i + 1, Nclus):
                    # differences in peaks i->j and j->i
                    a1 = log_den_c[centers[i]] - log_den_bord[i][j]
                    a2 = log_den_c[centers[j]] - log_den_bord[i][j]

                    # statistical thresholds
                    e1 = Z * (self.log_den_err[centers[i]] + log_den_bord_err[i][j])
                    e2 = Z * (self.log_den_err[centers[j]] + log_den_bord_err[i][j])

                    # if two peaks are not statistically significant, save their border
                    if a1 < e1 or a2 < e2:
                        check = 1
                        pos.append(log_den_bord[i][j])
                        ipos.append(i)
                        jpos.append(j)

            # merging process
            if check == 1:
                # start merging from border of higher density
                barrier_index = pos.index(max(pos))
                imod = ipos[barrier_index]
                jmod = jpos[barrier_index]

                # normalised log density of first and second peak to be merged
                c1 = (log_den_c[centers[imod]] - log_den_bord[imod][jmod]) / (
                    self.log_den_err[centers[imod]] + log_den_bord_err[imod][jmod]
                )
                c2 = (log_den_c[centers[jmod]] - log_den_bord[imod][jmod]) / (
                    self.log_den_err[centers[jmod]] + log_den_bord_err[imod][jmod]
                )

                # by default, the second peak is removed (c2) if however c1 < c2 the indices are
                # exchanged and the first cluster is removed instead
                if c1 < c2:
                    tmp = jmod
                    jmod = imod
                    imod = tmp

                # cluster jmod is removed
                surviving_clusters[jmod] = 0
                # log_den_bord between the removed peaks is set to -1 and error to 0
                log_den_bord[imod][jmod] = -1.0
                log_den_bord[jmod][imod] = -1.0
                log_den_bord_err[imod][jmod] = 0.0
                log_den_bord_err[jmod][imod] = 0.0

                # the points of the jmod peak are added to the imod peak
                cl_struct[imod].extend(cl_struct[jmod])
                cl_struct[jmod] = []

                # recompute the borders with the other peaks
                for i in range(Nclus):
                    if i != imod and i != jmod:
                        if log_den_bord[imod][i] < log_den_bord[jmod][i]:
                            bord_index[imod][i] = bord_index[jmod][i]
                            bord_index[i][imod] = bord_index[imod][i]

                            log_den_bord[imod][i] = log_den_bord[jmod][i]
                            log_den_bord[i][imod] = log_den_bord[imod][i]

                            log_den_bord_err[imod][i] = log_den_bord_err[jmod][i]
                            log_den_bord_err[i][imod] = log_den_bord_err[imod][i]

                        log_den_bord[jmod][i] = -1
                        log_den_bord[i][jmod] = log_den_bord[jmod][i]
                        log_den_bord_err[jmod][i] = 0
                        log_den_bord_err[i][jmod] = log_den_bord_err[jmod][i]

        return surviving_clusters, bord_index

    def _finalise_clustering(  # noqa: C901
        self,
        Nclus,
        halo,
        surviving_clusters,
        cl_struct,
        centers,
        log_den_bord,
        log_den_bord_err,
        bord_index,
        log_den_c,
    ):
        """Finalise clustering."""
        # compute final N_clusters, cluster_indices and cluster_centers
        N_clusters = 0
        cluster_indices = []
        cluster_centers = []
        nnum = []
        for j in range(Nclus):
            nnum.append(-1)
            if surviving_clusters[j] == 1:
                nnum[j] = N_clusters
                N_clusters += 1
                cluster_indices.append(cl_struct[j])
                cluster_centers.append(centers[j])

        # initialise the final arrays
        bord_indices_m = np.zeros((N_clusters, N_clusters), dtype=int)
        log_den_bord_m = np.zeros((N_clusters, N_clusters), dtype=float)
        log_den_bord_err_m = np.zeros((N_clusters, N_clusters), dtype=float)

        for j in range(Nclus):
            if surviving_clusters[j] == 1:
                jj = nnum[j]
                for k in range(Nclus):
                    if surviving_clusters[k] == 1:
                        kk = nnum[k]
                        log_den_bord_m[jj][kk] = log_den_bord[j][k]
                        log_den_bord_err_m[jj][kk] = log_den_bord_err[j][k]
                        bord_indices_m[jj][kk] = bord_index[j][k]

        Last_cls = np.empty(self.N, dtype=int)
        for j in range(N_clusters):
            for k in cluster_indices[j]:
                Last_cls[k] = j
        Last_cls_halo = np.copy(Last_cls)
        nh = 0
        for j in range(N_clusters):
            log_den_halo = max(log_den_bord_m[j])
            for k in cluster_indices[j]:
                if log_den_c[k] < log_den_halo:
                    nh = nh + 1
                    Last_cls_halo[k] = -1
        if halo:
            cluster_assignment = Last_cls_halo
        else:
            cluster_assignment = Last_cls

        log_den_bord_m = np.copy(log_den_bord_m)

        return (
            N_clusters,
            cluster_assignment,
            cluster_indices,
            cluster_centers,
            log_den_bord_m,
            log_den_bord_err_m,
            bord_indices_m,
        )

    def _find_borders_between_clusters_v2(  # noqa: C901
        self, Nclus, g, cl_struct, centers, cluster_init, log_den_c
    ):
        """Find saddle points between clusters.

        Assume i and j are points belonging to the clusters c(i) and c(j).
        Then, point i is a border point between c(i) and c(j) if:
            a) It has in its neighborhood a point j belonging to other cluster.
            b) There are no other points belonging to c(i) nearer from point j
            c) It has the maximum density among the points that fulfill a) & b)

        Args:
            Nclus: number of clusters                       int
            g: scaled log density of points                 np.array(N)
            cl_struct: points assigned to each cluster      list(list())
            centers: data index of density peaks            np.array(Nclus)
            cluster_init: cluster assignment                np.array(N)
            log_den_c: log density of points                np.array(N)

        Returns:
            log_den_bord: log density of the saddle points
            log_den_bord_err: error on the log density of saddle points
            bord_index: square matrix providing the index of the saddle point between any two peaks
        """
        saddle_count = 0
        for c in range(Nclus):
            # saddle point index, saddle point density, saddle point error, cluster1 index, cluster2 index, valid_saddle
            # create another array of indices

            saddle_density_tmp = np.zeros(
                (Nclus, 3), dtype=float
            )  # density, density_error, normalized_density
            saddle_indices_tmp = -np.ones(
                (Nclus, 4), dtype=int
            )  # saddle point, cluster1, cluster2, is_valid saddle

            if c == 0:
                saddle_density = saddle_density_tmp
                saddle_indices = saddle_indices_tmp
                # array of saddle points of cluster  c (will be eventually much less than Nclus)
            else:
                saddle_density = np.concatenate(
                    (saddle_density, saddle_density_tmp), axis=0
                )
                saddle_indices = np.concatenate(
                    (saddle_indices, saddle_indices_tmp), axis=0
                )

            for p1 in cl_struct[c]:  # p1 point in a given cluster
                if p1 in centers:
                    pp = -1

                else:
                    # a point p2 in the neighborhood of p1 belongs to a cluster cp different from p1
                    for k in range(1, self.kstar[p1] + 1):
                        p2 = self.dist_indices[p1, k]
                        pp = -1
                        if (
                            cluster_init[p2] != c
                        ):  # a point in the neighborhood of p1 belongs to another cluster
                            pp = p2
                            cp = cluster_init[pp]  # neighbor cluster index
                            break

                if pp != -1:
                    # p1 is the closest point in c to p2
                    for k in range(1, self.maxk):  # why maxk?
                        po = self.dist_indices[pp, k]  # pp here is p2
                        if po == p1:  # p1 is the closest point of p2 belonging to c
                            break
                        if (
                            cluster_init[po] == c
                        ):  # p1 is NOT the closest point of p2 belonging to c
                            pp = -1
                            break

                # here check if the border point is a saddle point
                if pp != -1:
                    # maybe contrunct a matrix of border points [p, g, c1, c2]
                    # in log_den_bord_tmp the cluster are coupled with indices that go from smaller to bigger
                    if c > cp:
                        c1 = cp
                        c2 = c
                    else:
                        c1 = c
                        c2 = cp

                    flag = 0
                    for i in range(saddle_count):
                        if c1 == saddle_indices[i, 1] and c2 == saddle_indices[i, 2]:
                            flag = 1  # there is already a border point between c and cp
                            if g[p1] > saddle_density[i, 2]:
                                saddle_indices[i, 0] = p1  # useful at the end
                                saddle_density[i] = np.array(
                                    [log_den_c[p1], self.log_den_err[p1], g[p1]]
                                )
                                break

                    if flag == 0:
                        saddle_indices[saddle_count] = np.array(
                            [p1, c1, c2, 1], dtype=int
                        )
                        saddle_density[saddle_count] = np.array(
                            [log_den_c[p1], self.log_den_err[p1], g[p1]]
                        )
                        saddle_count += 1

            saddle_density = saddle_density[:saddle_count]
            saddle_indices = saddle_indices[:saddle_count]

        sort_indices = saddle_density[:, 0].argsort()[::-1]
        saddle_density = saddle_density[sort_indices]
        saddle_indices = saddle_indices[sort_indices]

        return saddle_density[:, :2], saddle_indices

    def _multimodality_test_v2(
        self, Nclus, Z, log_den_c, centers, cl_struct, saddle_density, saddle_indices
    ):
        """Merge couples of peaks if these are not statistically significant.

        Args:
            Nclus: number of clusters                       int
            Z: scaled log density of points                 int
            log_den_c: log density of points                np.array(N)
            centers: data index of density peaks            np.array(Nclus)
            cl_struct: points assigned to each cluster      list(list())
            log_den_bord,                                   np.array((Nclus, 6))
        """
        check = 1
        # all clusters are initialised as "surviving"
        surviving_clusters = np.ones(Nclus, dtype=int)
        nsaddles = len(saddle_density)

        while check == 1:
            check = 0
            current_saddle = -1.0
            for i in range(nsaddles):  # log_den_bord already sorted
                if saddle_indices[i, 3] == 1:
                    clus1, clus2 = (
                        saddle_indices[i, 1],
                        saddle_indices[i, 2],
                    )  # cluster indices
                    center1, center2 = (
                        centers[clus1],
                        centers[clus2],
                    )  # data indices peak of clus1, clus2

                    a1 = (
                        log_den_c[center1] - saddle_density[i, 0]
                    )  # log_den_c is an array of the density of all the points
                    a2 = log_den_c[center2] - saddle_density[i, 0]
                    sum_err1 = self.log_den_err[center1] + saddle_density[i, 1]
                    sum_err2 = self.log_den_err[center2] + saddle_density[i, 1]

                    # statistical thresholds
                    e1 = Z * sum_err1
                    e2 = Z * sum_err2
                    if a1 < e1 or a2 < e2:
                        check = 1
                        if current_saddle < saddle_density[i, 0]:
                            max_a1, max_a2 = a1, a2
                            max_sum_err1, max_sum_err2 = sum_err1, sum_err2
                            to_remove = i
                            current_saddle = saddle_density[i, 0]

            if check == 1:
                saddle_indices[
                    to_remove, -1
                ] = 0  # the couple center1, center2 is removed
                margin1 = max_a1 / max_sum_err1
                margin2 = max_a2 / max_sum_err2

                # only the peak with the highest margin is kept:
                # by convention we set this peak to be center1
                if margin1 < margin2:
                    c1, c2 = saddle_indices[to_remove, 2], saddle_indices[to_remove, 1]
                else:
                    c1, c2 = saddle_indices[to_remove, 1], saddle_indices[to_remove, 2]

                surviving_clusters[c2] = 0  # the second peak is removed
                cl_struct[c1].extend(cl_struct[c2])
                cl_struct[c2] = []

                # select the clusters that are neighbors to cluster1 and cluster2
                # cluster label, saddle label, position of c1/c2 in saddle_indices[:, 1:3]

                saddle_indices = self._find_neighbor_clusters_v2(
                    saddle_density, saddle_indices, to_remove, nsaddles, c1, c2
                )

        return surviving_clusters, saddle_density, saddle_indices

    def _finalise_clustering_v2(  # noqa: C901
        self,
        Nclus,
        halo,
        surviving_clusters,
        cl_struct,
        centers,
        saddle_density,
        saddle_indices,
        log_den_c,
    ):
        """Finalise clustering."""
        N_clusters = 0
        mapping = -np.ones(Nclus, dtype=int)  # all original centers
        cluster_indices, cluster_centers = [], []
        for i in range(Nclus):
            if surviving_clusters[i] == 1:
                mapping[i] = N_clusters  # center is surviving
                N_clusters += 1
                # !!!!!! check the index i is correct?
                cluster_indices.append(cl_struct[i])
                cluster_centers.append(centers[i])

        bord_indices_m = -np.ones((N_clusters, N_clusters), dtype=int)
        log_den_bord_m = np.zeros((N_clusters, N_clusters), dtype=float)
        log_den_bord_err_m = np.zeros((N_clusters, N_clusters), dtype=float)

        for i in range(len(saddle_density)):  # all original saddles
            if saddle_indices[i, 3] == 1:  # a valid saddle is between two valid centers
                j, k = (
                    mapping[saddle_indices[i, 1]],
                    mapping[saddle_indices[i, 2]],
                )  # map the two valid centers in the corresponding valid new cluster labels
                log_den_bord_m[j, k] = saddle_density[i, 0]
                log_den_bord_m[k, j] = log_den_bord_m[j, k]

                log_den_bord_err_m[j, k] = saddle_density[i, 1]
                log_den_bord_err_m[k, j] = log_den_bord_err_m[j, k]

                bord_indices_m[j, k] = saddle_indices[i, 0]
                bord_indices_m[k, j] = bord_indices_m[j, k]

        log_den_bord_m[np.diag_indices(N_clusters)] = -1

        Last_cls = np.empty(self.N, dtype=int)
        for j in range(N_clusters):
            for k in cluster_indices[j]:
                Last_cls[k] = j
        Last_cls_halo = np.copy(Last_cls)
        nh = 0
        for j in range(N_clusters):
            log_den_halo = max(log_den_bord_m[j])
            for k in cluster_indices[j]:
                if log_den_c[k] < log_den_halo:
                    nh = nh + 1
                    Last_cls_halo[k] = -1
        if halo:
            cluster_assignment = Last_cls_halo
        else:
            cluster_assignment = Last_cls

        log_den_bord_m = np.copy(log_den_bord_m)

        return (
            N_clusters,
            cluster_assignment,
            cluster_indices,
            cluster_centers,
            log_den_bord_m,
            log_den_bord_err_m,
            bord_indices_m,
        )

    def _find_neighbor_clusters_v2(  # noqa: C901
        self,
        saddle_density,
        saddle_indices,
        to_remove,
        nsaddles,
        c1,
        c2,
    ):
        neighbors1 = np.zeros((len(saddle_density), 3), dtype=int)
        neighbors2 = np.zeros((len(saddle_density), 3), dtype=int)
        len_neigh1, len_neigh2 = 0, 0
        for i in range(nsaddles):
            if i != to_remove and saddle_indices[i, 3] == 1:
                cluster_couple = saddle_indices[i, 1:3]

                if c1 in cluster_couple:
                    neighbors1[len_neigh1] = np.array([cluster_couple[0], i, 2])
                    if cluster_couple[0] == c1:
                        neighbors1[len_neigh1] = np.array([cluster_couple[1], i, 1])
                    len_neigh1 += 1

                elif c2 in cluster_couple:
                    neighbors2[len_neigh2] = np.array([cluster_couple[0], i, 2])
                    if cluster_couple[0] == c2:
                        neighbors2[len_neigh2] = np.array([cluster_couple[1], i, 1])
                    len_neigh2 += 1

        neighbors1, neighbors2 = neighbors1[:len_neigh1], neighbors2[:len_neigh2]

        # check which are the common neighbors between cluster1 and cluster2
        neighbors1 = neighbors1[neighbors1[:, 0].argsort()]
        neighbors2 = neighbors2[neighbors2[:, 0].argsort()]
        i, j = 0, 0
        while i < len(neighbors1) and j < len(neighbors2):
            if neighbors1[i, 0] == neighbors2[j, 0]:  # there is a common element
                index1, index2 = (
                    neighbors1[i, 1],
                    neighbors2[j, 1],
                )  # index common neighbor w.r.t cluster1
                if saddle_density[index1, 0] < saddle_density[index2, 0]:
                    # if the density of the saddle is higher between cluster2 and common neighbor,
                    # use this as new saddle between cluster1 and common neighbor
                    saddle_density[index1, 0] = saddle_density[
                        index2, 0
                    ]  # saddle density
                    saddle_density[index1, 1] = saddle_density[
                        index2, 1
                    ]  # saddle error
                # the couples of common elements with the c2 are "deleted"
                saddle_indices[index2, -1] = 0
                i += 1
                j += 1

            elif neighbors1[i, 0] < neighbors2[j, 0]:
                i += 1
            else:
                # neighbors of c2 which were not neighbors of c1 become neighbors of c1
                index2, c2_index = neighbors2[j, 1], neighbors2[j, 2]
                saddle_indices[index2, c2_index] = c1
                j += 1

        # some js in the comparison may not have been taken into account
        while j < len(neighbors2):
            index2, c2_index = neighbors2[j, 1], neighbors2[j, 2]
            saddle_indices[index2, c2_index] = c1
            j += 1

        return saddle_indices