Source code for density_estimation

# Copyright 2021-2023 The DADApy Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
The *density_estimation* module contains the *DensityEstimation* class.

The different algorithms of density estimation are implemented as methods of this class.
"""

import multiprocessing
import time
import warnings

import numpy as np

from dadapy._cython import cython_density as cd
from dadapy._utils.density_estimation import (
    return_not_normalised_density_kstarNN,
    return_not_normalised_density_PAk,
    return_not_normalised_density_PAk_optimized,
)
from dadapy._utils.utils import compute_cross_nn_distances
from dadapy.kstar import KStar

cores = multiprocessing.cpu_count()


[docs] class DensityEstimation(KStar): """Computes the log-density and its error at each point and other properties. Inherits from class KStar. Can compute the log-density and its error at each point choosing among various kNN-based methods. Attributes: log_den (np.array(float), optional): array containing the N log-densities log_den_err (np.array(float), optional): array containing the N errors on the log_den """ def __init__( self, coordinates=None, distances=None, maxk=None, verbose=False, n_jobs=cores ): """Initialise the DensityEstimation class.""" super().__init__( coordinates=coordinates, distances=distances, maxk=maxk, verbose=verbose, n_jobs=n_jobs, ) self.log_den = None self.log_den_err = None # ----------------------------------------------------------------------------------------------
[docs] def set_kstar(self, k=0): """Set all elements of kstar to a fixed value k. Overload the set_kstar method from the superior class. First, call the set_kstar from the superior class. Then also reset all other DensityEstimation attributes depending on kstar to None. Args: k: number of neighbours used to compute the density. It can be an iteger or an array of integers """ super().set_kstar(k) self.log_den = None self.log_den_err = None
# ----------------------------------------------------------------------------------------------
[docs] def compute_density_kNN(self, k=10, bias=False): """Compute the density of each point using a simple kNN estimator. Args: k (int): number of neighbours used to compute the density Returns: log_den (np.ndarray(float)): estimated log density log_den_err (np.ndarray(float)): estimated error on log density """ if self.intrinsic_dim is None: _ = self.compute_id_2NN() if self.verb: print(f"k-NN density estimation started (k={k})") self.set_kstar(k) log_den, log_den_err, dc = return_not_normalised_density_kstarNN( self.distances, self.intrinsic_dim, self.kstar, interpolation=False, bias=bias, ) # Normalise density log_den -= np.log(self.N) self.log_den = log_den self.log_den_err = log_den_err self.dc = dc if self.verb: print("k-NN density estimation finished") return self.log_den, self.log_den_err
# ----------------------------------------------------------------------------------------------
[docs] def compute_density_kstarNN(self, Dthr=23.92812698, bias=False): """Compute the density of each point using a simple kNN estimator with an optimal choice of k. Args: Dthr (float): Likelihood ratio parameter used to compute optimal k, the value of Dthr=23.92 corresponds to a p-value of 1e-6. Returns: log_den (np.ndarray(float)): estimated log density log_den_err (np.ndarray(float)): estimated error on log density """ if self.kstar is None: self.compute_kstar(Dthr=Dthr) if self.verb: print("kstar-NN density estimation started") log_den, log_den_err, dc = return_not_normalised_density_kstarNN( self.distances, self.intrinsic_dim, self.kstar, interpolation=False, bias=bias, ) # Normalise density log_den -= np.log(self.N) self.log_den = log_den self.log_den_err = log_den_err self.dc = dc if self.verb: print("k-NN density estimation finished") return self.log_den, self.log_den_err
# ----------------------------------------------------------------------------------------------
[docs] def compute_density_kpeaks(self, Dthr=23.92812698): """Compute the density of each point as proportional to the optimal k value found for that point. This method is mostly useful for the kpeaks clustering algorithm. Args: Dthr: Likelihood ratio parameter used to compute optimal k, the value of Dthr=23.92 corresponds to a p-value of 1e-6. Returns: log_den (np.ndarray(float)): estimated log density log_den_err (np.ndarray(float)): estimated error on log density """ self.compute_kstar(Dthr) if self.verb: print("Density estimation for k-peaks clustering started") dc = np.zeros(self.N, dtype=float) log_den = np.zeros(self.N, dtype=float) log_den_err = np.zeros(self.N, dtype=float) log_den_min = 9.9e300 for i in range(self.N): k = self.kstar[i] dc[i] = self.distances[i, k] log_den[i] = k log_den_err[i] = 0 for j in range(1, k): jj = self.dist_indices[i, j] log_den_err[i] = log_den_err[i] + (self.kstar[jj] - k) ** 2 log_den_err[i] = np.sqrt(log_den_err[i] / k) if log_den[i] < log_den_min: log_den_min = log_den[i] # Normalise density self.log_den = log_den self.log_den_err = log_den_err self.dc = dc if self.verb: print("k-peaks density estimation finished") return self.log_den, self.log_den_err
# ----------------------------------------------------------------------------------------------
[docs] def compute_density_PAk(self, Dthr=23.92812698, optimized=True): """Compute the density of each point using the PAk estimator. Args: Dthr (float): Likelihood ratio parameter used to compute optimal k, the value of Dthr=23.92 corresponds to a p-value of 1e-6. Returns: log_den (np.ndarray(float)): estimated log density log_den_err (np.ndarray(float)): estimated error on log density """ # compute optimal k if self.kstar is None: self.compute_kstar(Dthr=Dthr) elif len(np.unique(self.kstar)) == 1: warnings.warn( "Found pointwise optimal k already computed and CONSTANT over the datapoints. \ Make sure to have used a point-adaptive k selection function such as \ 'self.compute_kstar()'' ", stacklevel=2, ) if self.verb: print("PAk density estimation started") sec = time.time() if optimized: log_den, log_den_err, dc = return_not_normalised_density_PAk_optimized( self.distances, self.intrinsic_dim, self.kstar, interpolation=False, ) else: log_den, log_den_err, dc = return_not_normalised_density_PAk( self.distances, self.intrinsic_dim, self.kstar, interpolation=False, ) sec2 = time.time() if self.verb: print( "{0:0.2f} seconds optimizing the likelihood for all the points".format( sec2 - sec ) ) # Normalize density log_den -= np.log(self.N) self.log_den = log_den self.log_den_err = log_den_err self.dc = dc if self.verb: print("PAk density estimation finished") return self.log_den, self.log_den_err
# ----------------------------------------------------------------------------------------------
[docs] def return_entropy(self): """Compute a very rough estimate of the sample Shannon entropy of the data distribution. The computation simply returns the average negative log probability estimates. Returns: H (float): the estimated entropy of the distribution """ assert self.log_den is not None H = -np.mean(self.log_den) return H
# ----------------------------------------------------------------------------------------------
[docs] def return_interpolated_density_kNN(self, X_new, k): """Return the kNN density of the primary dataset, evaluated on a new set of points "X_new". Args: X_new (np.ndarray(float)): The points onto which the density should be computed k (int): the number of neighbours considered for the kNN estimator Returns: log_den (np.ndarray(float)): log density of dataset evaluated on X_new log_den_err (np.ndarray(float)): error on log density estimates """ assert self.X is not None if self.intrinsic_dim is None: _ = self.compute_id_2NN() cross_distances, _ = compute_cross_nn_distances( X_new, self.X, self.maxk, self.metric, self.period ) kstar = np.ones(X_new.shape[0], dtype=int) * k log_den, log_den_err, _ = return_not_normalised_density_kstarNN( cross_distances, self.intrinsic_dim, kstar, interpolation=True ) # Normalise density log_den -= np.log(self.N) return log_den, log_den_err
# ----------------------------------------------------------------------------------------------
[docs] def return_interpolated_density_kstarNN(self, X_new, Dthr=23.92812698): """Return the kstarNN density of the primary dataset, evaluated on a new set of points "X_new". Args: X_new (np.ndarray(float)): The points onto which the density should be computed Dthr: Likelihood ratio parameter used to compute optimal k Returns: log_den (np.ndarray(float)): log density of dataset evaluated on X_new log_den_err (np.ndarray(float)): error on log density estimates """ assert self.X is not None if self.intrinsic_dim is None: _ = self.compute_id_2NN() cross_distances, cross_dist_indices = compute_cross_nn_distances( X_new, self.X, self.maxk, self.metric, self.period ) kstar = cd._compute_kstar_interp( self.intrinsic_dim, X_new.shape[0], self.maxk, Dthr, cross_dist_indices, cross_distances, self.distances, ) log_den, log_den_err, _ = return_not_normalised_density_kstarNN( cross_distances, self.intrinsic_dim, kstar, interpolation=True ) # Normalise density log_den -= np.log(self.N) return log_den, log_den_err
# ----------------------------------------------------------------------------------------------
[docs] def return_interpolated_density_PAk(self, X_new, Dthr=23.92812698): """Return the PAk density of the primary dataset, evaluated on a new set of points "X_new". Args: X_new (np.ndarray(float)): The points onto which the density should be computed Dthr: Likelihood ratio parameter used to compute optimal k Returns: log_den (np.ndarray(float)): log density of dataset evaluated on X_new log_den_err (np.ndarray(float)): error on log density estimates """ assert self.X is not None if self.intrinsic_dim is None: _ = self.compute_id_2NN() cross_distances, cross_dist_indices = compute_cross_nn_distances( X_new, self.X, self.maxk, self.metric, self.period ) kstar = cd._compute_kstar_interp( self.intrinsic_dim, X_new.shape[0], self.maxk, Dthr, cross_dist_indices, cross_distances, self.distances, ) log_den, log_den_err, _ = return_not_normalised_density_PAk( cross_distances, self.intrinsic_dim, kstar, self.maxk, interpolation=True ) # Normalise density log_den -= np.log(self.N) return log_den, log_den_err