Source code for feature_weighting

# Copyright 2021-2023 The DADApy Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
The *feature_weighting* module contains the *FeatureWeighting* class.

This class uses the Differentiable Information Imbalance (DII) to weight and select features.
"""

import multiprocessing
import time
import warnings
from functools import wraps
from typing import Type, Union

import numpy as np
from scipy.linalg import norm

from dadapy._utils.differentiable_imbalance import (
    _extract_min_diis_lasso_optimization,
    _optimize_dii,
    _optimize_dii_static_zeros,
    _plot_min_lasso_results,
    _refine_lasso_optimization,
    _return_dii,
    _return_dii_gradient,
    _return_full_dist_matrix,
    _return_full_rank_matrix,
    _return_optimal_lambda_from_distances,
)
from dadapy.base import Base

# Number of CPUs reported by the system; used below as the default for ``n_jobs``.
cores = multiprocessing.cpu_count()


def check_maxk(func):
    """Wrap a method so that the 'maxk' notice is emitted at most once per object.

    The wrapped callable expects the object as its first positional argument.
    While the object's ``_maxk_warning`` flag is set, the first call compares
    ``maxk`` with ``N - 1``, warns if they differ, and then clears the flag so
    later calls go straight to ``func``.

    Args:
        func: the callable to wrap.

    Returns:
        The wrapped callable, carrying the metadata of ``func``.
    """
    # TODO: remove when this works with different maxk

    @wraps(func)
    def guarded(*args, **kwargs):
        instance = args[0]
        if not instance._maxk_warning:
            return func(*args, **kwargs)

        if instance.maxk != instance.N - 1:
            message = (
                "maxk option not yet available for the FeatureWeighting class. "
                f"It will be set to the number of data-1 ({instance.N}-1)."
            )
            # stacklevel=2 attributes the warning to the caller of the wrapped method
            warnings.warn(message, stacklevel=2)
        instance._maxk_warning = False
        return func(*args, **kwargs)

    return guarded


class FeatureWeighting(Base):
    """Compute and optimize the Differentiable Information Imbalance (DII).

    The DII is evaluated between the data held by this object (input space) and
    the data of a second object (ground truth / target space).

    Attributes:
        history (dict or None): results of the most recent optimization-type
            call; ``None`` until such a method has been run. Each method
            replaces the dictionary with its own entries.
    """

    def __init__(
        self,
        coordinates=None,
        distances=None,
        maxk=None,
        period=None,
        verbose=False,
        n_jobs=cores,
    ):
        """Forward all arguments to ``Base`` and initialise the class-specific state."""
        super().__init__(
            coordinates=coordinates,
            distances=distances,
            maxk=maxk,
            period=period,
            verbose=verbose,
            n_jobs=n_jobs,
        )
        # This is quite useful for debugging
        self._cythond = True
        self.history = None
        self._full_distance_matrix = None
        # To show maxk warning only once
        self._maxk_warning = True

    @property
    def full_distance_matrix(self):
        """Full distance matrix of ``self.X``, computed on first access and cached."""
        if self._full_distance_matrix is None:
            self._full_distance_matrix = _return_full_dist_matrix(
                data=self.X,
                n_jobs=self.n_jobs,
                period=self._parse_own_period(),
                cythond=self._cythond,
            )
        return self._full_distance_matrix

    @full_distance_matrix.setter
    def full_distance_matrix(self, distance_matrix: np.ndarray):
        """Store a user-supplied full distance matrix.

        Raises:
            ValueError: if ``distance_matrix`` is not of shape ``(N, N)``.
        """
        shape = tuple(distance_matrix.shape)
        if shape != (self.N, self.N):
            # Format the whole shape tuple: indexing shape[1] would raise an
            # IndexError (instead of this ValueError) for non-2D input.
            found = "x".join(str(extent) for extent in shape) or "0-dimensional"
            raise ValueError(
                "Input matrix for full distance matrix not properly shaped. "
                f"Should be {self.N}x{self.N} but is {found}."
            )
        self._full_distance_matrix = distance_matrix

    @staticmethod
    def _parse_period_for_dii(in_period, in_dims):
        """Normalise a period specification to ``None`` or an array of shape ``(in_dims,)``.

        Note that an array of the right shape is returned as is (no copy), so
        callers must not modify the result in place.

        Raises:
            ValueError: if ``in_period`` is neither None, an int/float scalar,
                nor a numpy array of shape ``(in_dims,)``.
        """
        # TODO: remove when part of Base
        if in_period is None:
            return None
        if isinstance(in_period, np.ndarray) and in_period.shape == (in_dims,):
            return in_period
        if isinstance(in_period, (int, float)):
            return np.full(in_dims, fill_value=in_period, dtype=float)
        raise ValueError(
            f"'period' must be either a float scalar or a numpy array of floats of shape ({in_dims},)"
        )

    def _parse_own_period(self):
        """Return this object's period in the form produced by ``_parse_period_for_dii``."""
        return self._parse_period_for_dii(self.period, self.dims)

    def _parse_initial_weights(self, initial_weights: Union[np.ndarray, int, float]):
        """Normalise the weights argument to an array of shape ``(self.dims,)``.

        Args:
            initial_weights: None, an int/float scalar, or a numpy array of
                shape ``(self.dims,)``.

        Returns:
            np.ndarray: ``1 / std`` of each column of ``self.X`` for None, a
            constant float array for a scalar, or the given array unchanged.

        Raises:
            ValueError: for any other type, or an array of the wrong shape.
        """
        if initial_weights is None:
            return 1 / np.std(self.X, axis=0)
        if isinstance(initial_weights, np.ndarray):
            if initial_weights.shape == (self.dims,):
                return initial_weights
        elif isinstance(initial_weights, (int, float)):
            return np.full(self.dims, fill_value=initial_weights, dtype=float)
        raise ValueError(
            "'initial_weights' must be either None,"
            f" float scalar or a numpy array of floats of shape ({self.dims},)"
        )

    @check_maxk
    def return_optimal_lambda(self, fraction: float = 1.0):
        """Computes the optimal softmax scaling parameter lambda for the DII optimization.

        This parameter represents a reasonable scale of distances of the data points in the input data set.

        Args:
            fraction (float): Zoom in or out from the optimal distance scale. Default: 1.0.
                Suggested to keep it at default. Values > 1. show a bigger scale (in the optimization,
                this means include more neighbors in the softmax), values < 1. show a smaller scale
                (on average fewer neighbors are included, and for very small values only the first neighbor).

        Returns:
            The value computed by ``_return_optimal_lambda_from_distances`` on the full distance matrix.
        """
        return _return_optimal_lambda_from_distances(
            self.full_distance_matrix, fraction
        )

    @check_maxk
    def return_optimal_learning_rate(
        self,
        target_data: Type[Base],
        n_epochs: int = 50,
        n_samples: int = 200,
        initial_weights: Union[np.ndarray, int, float] = None,
        lambd: float = None,
        decaying_lr: str = "exp",
        trial_learning_rates: np.ndarray = None,
    ):
        """Find the optimal learning rate for the optimization of the DII by testing several on a reduced set

        Args:
            target_data: FeatureWeighting object, containing the groundtruth data
                (D_groundtruth x N array, period (optional)) to be compared to.
            n_epochs (int): number of epochs in each optimization cycle
            n_samples (int): Number of samples to use for the learning rate screening. Default = 200.
                The last ``n_samples`` points are used; if the data set has fewer points, all are used.
            initial_weights (np.ndarray, int, float or None): D(input) initial weights for the input features.
                No zeros allowed here. If None, the inverse standard deviation of each feature is used.
            lambd (float): softmax scaling. If None (preferred), it is passed on as None to the optimizer.
            decaying_lr (string): Default: "exp".
                "exp" for exponentially decaying learning rate (cut in half every 10 epochs):
                lrate = l_rate_initial * 2**(-i_epoch/10),
                or "cos" for cosine decaying learning rate:
                lrate = l_rate_initial * 0.5 * (1+ cos((pi * i_epoch)/n_epochs)).
                "static" for no decay in the learning rate.
            trial_learning_rates (np.ndarray or list or None): learning rates to try.
                If None are given, a sensible set of learning rates is tested.

        Returns:
            opt_l_rate (float): Learning rate, which leads to optimal unregularized (no l1-penalty) result
                in the specified number of epochs.

        History entries added to FeatureWeighting object:
            trial_learning_rates: learning rates which were tested to find optimal one.
            dii_per_epoch_per_lr: np.ndarray, shape (len(trial_learning_rates), n_epochs+1).
                DII for each trial learning rate at each epoch.
            weights_per_epoch_per_lr: np.ndarray, shape (len(trial_learning_rates), n_epochs+1, D).
                Weights for each trial learning rate and at each epoch.
        These history entries can be accessed as follows: objectname.history['entry_name']
        """
        in_data = self.X.copy()
        groundtruth = target_data.X.copy()
        if n_samples <= len(in_data):
            in_data = in_data[-n_samples:]
            groundtruth = groundtruth[-n_samples:]

        initial_weights = self._parse_initial_weights(initial_weights)
        period = self._parse_own_period()
        groundtruthperiod = self._parse_period_for_dii(
            target_data.period, target_data.dims
        )

        if trial_learning_rates is None:
            # these learning rates seem to work well for most data
            lrates = np.array([0.001, 0.01, 0.1, 1.0, 10.0, 50.0, 100.0, 200.0])
        else:
            lrates = trial_learning_rates

        weights_per_epoch_per_lr = np.zeros(
            (len(lrates), n_epochs + 1, in_data.shape[1])
        )
        dii_per_epoch_per_lr = np.zeros((len(lrates), n_epochs + 1))

        # optimizations for different learning rates
        for i, lrate in enumerate(lrates):
            (
                weights_per_epoch_per_lr[i],
                dii_per_epoch_per_lr[i],
                _,
            ) = _optimize_dii(
                groundtruth_data=groundtruth,
                data=in_data,
                weights_0=initial_weights,
                lambd=lambd,
                n_epochs=n_epochs,
                l_rate=lrate,
                constrain=False,
                l1_penalty=0.0,
                decaying_lr=decaying_lr,
                period=period,
                groundtruthperiod=groundtruthperiod,
                n_jobs=self.n_jobs,
                cythond=self._cythond,
            )

        # find best imbalance: lowest DII in the final epoch, ignoring NaN runs
        opt_lrate_index = np.nanargmin(dii_per_epoch_per_lr[:, -1])
        opt_l_rate = lrates[opt_lrate_index]

        self.history = {
            "dii_per_epoch_per_lr": dii_per_epoch_per_lr,
            "weights_per_epoch_per_lr": weights_per_epoch_per_lr,
            "trial_learning_rates": lrates,
        }
        return opt_l_rate

    @check_maxk
    def return_dii(self, target_data: Type[Base], lambd: float = None):
        """Computes the DII between two FeatureWeighting objects based on distances of input data and rank information of groundtruth data.

        Args:
            target_data: FeatureWeighting object, containing the groundtruth data
                (D_groundtruth x N array, period (optional)) to be compared to.
            lambd (float, optional): The softmax scaling parameter. Default: None, in which case it is
                calculated with 'return_optimal_lambda'. The higher this value, the more nearest
                neighbors are included.

        Returns:
            dii (float): The computed DII value. Depends on the softmax scale lambda.
        """
        # only accepting target data of Base (or children) is slow if base automatically calculates distances.
        # either add lazyBase or find other way to implement things like period and metric of Base.
        if lambd is None:
            lambd = self.return_optimal_lambda()

        distances_i = self.full_distance_matrix
        rank_matrix_j = _return_full_rank_matrix(
            target_data.X,
            period=self._parse_period_for_dii(target_data.period, target_data.dims),
            n_jobs=self.n_jobs,
        )
        return _return_dii(
            dist_matrix_A=distances_i, rank_matrix_B=rank_matrix_j, lambd=lambd
        )

    @check_maxk
    def return_dii_gradient(
        self, target_data: Type[Base], weights: np.ndarray, lambd: float = None
    ):
        """Computes the gradient of the DII between two FeatureWeighting objects (input object and ground truth object (= target_data)) with respect to the weights of the input features.

        Args:
            target_data: FeatureWeighting object, containing the groundtruth data
                (D_groundtruth x N array, period (optional)) to be compared to.
            weights (np.ndarray): The array of weight values for the input values, shape (D,),
                where D is the dimension of data.
            lambd (float, optional): The softmax scaling parameter. Default: None, in which case it is
                calculated with 'return_optimal_lambda'. The higher this value, the more nearest
                neighbors are included.

        Returns:
            dii_weight_gradient (np.ndarray): The computed gradient of DII with respect to the weights.
                Depends on the softmax scale lambda.
        """
        if lambd is None:
            lambd = self.return_optimal_lambda()

        period = self._parse_own_period()
        if period is not None:
            # Build a new array here. _parse_period_for_dii can return
            # self.period itself, so an in-place `*=` would rescale the stored
            # period on every call (and fail for an integer-typed period).
            period = period * weights
        target_period = self._parse_period_for_dii(
            target_data.period, in_dims=target_data.dims
        )

        rescaled_distances_i = _return_full_dist_matrix(
            self.X * weights, period=period, n_jobs=self.n_jobs
        )
        rank_matrix_j = _return_full_rank_matrix(
            target_data.X, period=target_period, n_jobs=self.n_jobs
        )
        return _return_dii_gradient(
            rescaled_distances_i,
            self.X,
            rank_matrix_j,
            weights=self._parse_initial_weights(weights),
            lambd=lambd,
            period=period,
            n_jobs=self.n_jobs,
            cythond=self._cythond,
        )

    @check_maxk
    def return_weights_optimize_dii(
        self,
        target_data: Type[Base],
        n_epochs: int = 100,
        constrain: bool = False,
        initial_weights: Union[np.ndarray, int, float] = None,
        lambd: float = None,
        learning_rate: float = None,
        l1_penalty: float = 0.0,
        decaying_lr: str = "exp",
    ):
        """Optimize the differentiable information imbalance using gradient descent of the DII between input data object A and groundtruth data object B.

        Args:
            target_data: FeatureWeighting object, containing the groundtruth data
                (D_groundtruth x N array, period (optional)) to be compared to.
            n_epochs (int): The number of epochs in the gradient descent optimization. Default: 100.
            constrain (bool): Constrain the sum of the weights to sum up to the number of weights.
                Default: False
            initial_weights (np.ndarray, int, float or None): starting weight values for the input
                features, shape (D,) where D is the dimension of data. If None, it is initialized to
                the inverse standard deviation (1/std) of each variable. This cannot be initialized to 0's.
            lambd (float, optional): The lambda scaling parameter of the softmax.
                If None, it is passed on as None to the optimizer. Default is None.
            learning_rate (float, optional): The learning rate of the gradient descent.
                If None, it is estimated with 'return_optimal_learning_rate'.
            l1_penalty (float, optional): The l1-regularization strength, if sparsity is needed.
                Default: 0 (l1-regularization turned off).
            decaying_lr (string): Default: "exp".
                "exp" for exponentially decaying learning rate (cut in half every 10 epochs):
                lrate = l_rate_initial * 2**(-i_epoch/10),
                or "cos" for cosine decaying learning rate:
                lrate = l_rate_initial * 0.5 * (1+ cos((pi * i_epoch)/n_epochs)).
                "static" for no decay in the learning rate.

        Returns:
            final_weights: np.ndarray, shape (D). Array of the optimized weights.

        History entries added to FeatureWeighting object:
            weights_per_epoch: np.ndarray, shape (n_epochs+1, D).
                Weights during optimization.
            dii_per_epoch: np.ndarray, shape (n_epochs+1, ).
                Differentiable information imbalances during optimization.
            l1_term_per_epoch: np.ndarray, shape (n_epochs+1, ).
                l1_penalty terms contributing to the loss function during optimization.
        These history entries can be accessed as follows: objectname.history['entry_name']
        """
        # initiate the weights
        period = self._parse_own_period()
        initial_weights = self._parse_initial_weights(initial_weights)

        # find a suitable learning rate by choosing the best optimization
        if learning_rate is None:
            learning_rate = self.return_optimal_learning_rate(
                target_data=target_data,
                n_epochs=50,
                n_samples=200,
                initial_weights=initial_weights,
                lambd=lambd,
                decaying_lr=decaying_lr,
                trial_learning_rates=None,
            )

        weights_list, diis, l1_loss_terms = _optimize_dii(
            groundtruth_data=target_data.X,
            groundtruthperiod=self._parse_period_for_dii(
                target_data.period, target_data.dims
            ),
            data=self.X,
            period=period,
            weights_0=initial_weights,
            lambd=lambd,
            constrain=constrain,
            l1_penalty=l1_penalty,
            n_epochs=n_epochs,
            l_rate=learning_rate,
            decaying_lr=decaying_lr,
            n_jobs=self.n_jobs,
            cythond=self._cythond,
        )

        # TODO: include a function that gives at least a reasonable estimate for the l1 penalty when wanting x features
        self.history = {
            "weights_per_epoch": weights_list,
            "dii_per_epoch": diis,
            "l1_term_per_epoch": l1_loss_terms,
        }
        return weights_list[-1]

    @check_maxk
    def return_backward_greedy_dii_elimination(
        self,
        target_data: Type[Base],
        initial_weights: Union[np.ndarray, int, float] = None,
        lambd: float = None,
        n_epochs: int = 100,
        learning_rate: float = None,
        constrain: bool = False,
        decaying_lr: str = "exp",
    ):
        """Do a stepwise backward elimination of feature weights, always eliminating the lowest weight; after each elimination the DII is optimized by gradient descent using the remaining features

        Args:
            target_data: FeatureWeighting object, containing the groundtruth data
                (D_groundtruth x N array, period (optional)) to be compared to.
            initial_weights (np.ndarray, int, float or None): D(input) initial weights for the input features.
                No zeros allowed here. If None, the inverse standard deviation of each feature is used.
            lambd (float): softmax scaling. If None (preferred), it is passed on as None to the optimizer.
            n_epochs (int): number of epochs in each optimization cycle
            learning_rate (float): learning rate. Has to be tuned, especially if constrain=True
                (otherwise optimization could fail). If None, it is estimated with
                'return_optimal_learning_rate'.
            constrain (bool): passed to the optimizer in every elimination step. Default: False
            decaying_lr (string): Default: "exp".
                "exp" for exponentially decaying learning rate (cut in half every 10 epochs):
                lrate = l_rate_initial * 2**(-i_epoch/10),
                or "cos" for cosine decaying learning rate:
                lrate = l_rate_initial * 0.5 * (1+ cos((pi * i_epoch)/n_epochs)).
                "static" for no decay in the learning rate.

        Returns:
            final_diis: np.ndarray, shape (D). Array of the optimized DII for each of the according weights.
            final_weights: np.ndarray, shape (D x D). Array of the optimized weights for each number of
                non-zero weights.

        History entries added to FeatureWeighting object:
            dii_per_epoch: np.ndarray, shape (D, n_epochs+1).
                DII during optimization for every epoch and number of non-zero weights.
                For final imbalances: history['dii_per_epoch'][:,-1]
            weights_per_epoch: np.ndarray, shape (D, n_epochs+1, D).
                Weights during optimisation for every epoch and every number of non-zero weights.
                For final weights: history['weights_per_epoch'][:,-1,:]
        Rows that were never reached during the elimination stay NaN.
        These history entries can be accessed as follows: objectname.history['entry_name']
        """
        initial_weights = self._parse_initial_weights(initial_weights)

        # INFO: do not precompute optimal lambda here, otherwise it becomes a fixed value in the optimization
        # and the results are not optimal any more.
        if learning_rate is None:
            learning_rate = self.return_optimal_learning_rate(
                target_data=target_data,
                n_epochs=50,
                n_samples=200,
                initial_weights=initial_weights,
                lambd=lambd,
                decaying_lr=decaying_lr,
                trial_learning_rates=None,
            )

        weights_per_epoch = np.full((self.dims, n_epochs + 1, self.dims), np.nan)
        imbalances_per_epoch = np.full((self.dims, n_epochs + 1), np.nan)

        # for making a warm start already for the first optimization
        end_weights = self.return_weights_optimize_dii(
            target_data=target_data,
            n_epochs=n_epochs,
            initial_weights=initial_weights,
            lambd=lambd,
            learning_rate=learning_rate,
            decaying_lr=decaying_lr,
            l1_penalty=0.0,
        )

        # The periods do not change between elimination steps.
        period = self._parse_own_period()
        groundtruthperiod = self._parse_period_for_dii(
            target_data.period, target_data.dims
        )

        # norm(..., 0) counts the non-zero entries of the weight vector
        nonzeros = norm(end_weights, 0)
        while nonzeros >= 1:
            start = time.time()
            gs, imbs = _optimize_dii_static_zeros(
                groundtruth_data=target_data.X,
                data=self.X,
                weights_0=end_weights,
                lambd=lambd,
                n_epochs=n_epochs,
                l_rate=learning_rate,
                constrain=constrain,
                decaying_lr=decaying_lr,
                period=period,
                groundtruthperiod=groundtruthperiod,
                n_jobs=self.n_jobs,
                cythond=self._cythond,
            )
            timing = time.time() - start
            if self.verb:
                print(
                    f"number of nonzero weights: {int(nonzeros)}, execution time: {timing:.2f} s."
                )

            # Row index = number of features eliminated so far.
            row = self.dims - int(nonzeros)
            weights_per_epoch[row] = gs
            imbalances_per_epoch[row] = imbs

            end_weights = gs[-1].copy()
            # Mask zeros so that nanargmin only considers the remaining weights.
            candidates = end_weights.copy()
            candidates[candidates == 0] = np.nan
            if np.isnan(candidates).all():
                break

            end_weights[np.nanargmin(candidates)] = 0
            nonzeros = norm(end_weights, 0)

        self.history = {
            "dii_per_epoch": imbalances_per_epoch,
            "weights_per_epoch": weights_per_epoch,
        }
        # to select a sensible set of features, plot the imbalances
        # and choose at which number of non-zero weights still enough information is retained
        return imbalances_per_epoch[:, -1], weights_per_epoch[:, -1, :]