Module artemis.importance_methods.model_agnostic

Expand source code
from artemis.importance_methods.model_agnostic._pdp import PartialDependenceBasedImportance
from artemis.importance_methods.model_agnostic._permutational_importance import PermutationImportance

__all__ = ["PermutationImportance", "PartialDependenceBasedImportance"]

Classes

class PartialDependenceBasedImportance

Partial Dependence Based Feature Importance. It is used for calculating feature importance for partial dependence based feature interaction methods: Friedman's H-statistic and Greenwell methods.

Attributes

method : str
Method name.
feature_importance : pd.DataFrame
Feature importance values.
features_included : List[str]
List of features for which importance is calculated.
X_sampled : pd.DataFrame
Sampled data used for calculation.
pd_calculator : PartialDependenceCalculator
Object used to calculate and store partial dependence values.

References

  • https://arxiv.org/abs/1805.04755

Constructor for PartialDependenceBasedImportance

Expand source code
class PartialDependenceBasedImportance(FeatureImportanceMethod):
    """
    Feature importance derived from partial dependence profiles.

    Companion importance measure for the partial dependence based feature
    interaction methods: Friedman's H-statistic and Greenwell methods.

    Attributes
    ----------
    method : str
        Method name.
    feature_importance : pd.DataFrame
        Feature importance values.
    features_included : List[str]
        List of features for which importance is calculated.
    X_sampled : pd.DataFrame
        Sampled data used for calculation.
    pd_calculator : PartialDependenceCalculator
        Object used to calculate and store partial dependence values.

    References
    ----------
    - https://arxiv.org/abs/1805.04755
    """

    def __init__(self):
        """Constructor for PartialDependenceBasedImportance"""
        super().__init__(ImportanceMethod.PDP_BASED_IMPORTANCE)

    def importance(
        self,
        model,
        X: pd.DataFrame,
        n: int = None,
        predict_function: Optional[Callable] = None,
        features: Optional[List[str]] = None,
        show_progress: bool = False,
        batchsize: int = 2000,
        pd_calculator: Optional[PartialDependenceCalculator] = None,
    ):
        """Calculates Partial Dependence Based Feature Importance.

        Parameters
        ----------
        model : object
            Model to explain. It should expose `predict_proba` or `predict`, unless
            `predict_function` is supplied.
        X : pd.DataFrame
            Data used to calculate importance. When `n` is not None, `n` rows are sampled from it.
        n : int, optional
            Number of rows to sample for the calculation. If None, every row of X is used.
            Default is None.
        predict_function : Callable, optional
            Function producing model output; it takes the model and a dataset and returns predictions.
            If None, `predict_proba` is used when available, otherwise `predict`. Default is None.
        features : List[str], optional
            Features for which importance is calculated. If None, all columns of X are used.
            Default is None.
        show_progress : bool
            If True, a progress bar is shown. Default is False.
        batchsize : int
            Batch size for calculating partial dependence. Rows for prediction are gathered until their
            number exceeds `batchsize`; `predict_function` is then called once for the whole batch,
            which reduces the number of `predict_function` calls. Default is 2000.
        pd_calculator : PartialDependenceCalculator, optional
            Calculator already holding partial dependence values for the given model and dataset.
            Supplying it avoids recalculating those values. If None, a new one is created.
            Default is None.

        Returns
        -------
        pd.DataFrame
            Result dataframe containing feature importance with columns: "Feature", "Importance"

        Raises
        ------
        ValueError
            If the supplied `pd_calculator` holds a different model or different data
            than the ones used by this call.
        """
        self.predict_function = get_predict_function(model, predict_function)
        self.X_sampled = sample_if_not_none(self._random_generator, X, n)
        self.features_included = all_if_none(X.columns, features)

        if pd_calculator is not None:
            # A reused calculator is only valid for the very same model and sampled data.
            if pd_calculator.model != model:
                raise ValueError("Model in PDP calculator is different than the model in the method.")
            if not pd_calculator.X.equals(self.X_sampled):
                raise ValueError("Data in PDP calculator is different than the data in the method.")
            self.pd_calculator = pd_calculator
        else:
            self.pd_calculator = PartialDependenceCalculator(
                model, self.X_sampled, self.predict_function, batchsize
            )

        self.feature_importance = self._pdp_importance(show_progress)
        return self.feature_importance

    @property
    def importance_ascending_order(self):
        """Value passed as `ascending` when sorting the importance table; always False."""
        return False

    def _pdp_importance(self, show_progress: bool) -> pd.DataFrame:
        """Build the sorted "Feature"/"Importance" table from single-feature partial dependence."""
        calculator = self.pd_calculator
        calculator.calculate_pd_single(show_progress=show_progress)

        numerical, _ = split_features_num_cat(self.X_sampled, self.features_included)
        rows = [
            _calc_importance(name, calculator.get_pd_single(name), name in numerical)
            for name in self.features_included
        ]

        table = pd.DataFrame(rows, columns=["Feature", "Importance"])
        ordered = table.sort_values(
            by="Importance", ascending=self.importance_ascending_order, ignore_index=True
        )
        # Missing importance values are reported as 0 after sorting.
        return ordered.fillna(0)

Ancestors

  • artemis.importance_methods._method.FeatureImportanceMethod

Instance variables

var importance_ascending_order
Expand source code
@property
def importance_ascending_order(self):
    """Value used as the `ascending` flag when sorting importance values; always False."""
    return False

Methods

def importance(self, model, X: pandas.core.frame.DataFrame, n: int = None, predict_function: Optional[Callable] = None, features: Optional[List[str]] = None, show_progress: bool = False, batchsize: int = 2000, pd_calculator: Optional[artemis._utilities.pd_calculator.PartialDependenceCalculator] = None)

Calculates Partial Dependence Based Feature Importance.

Parameters

model : object
Model for which importance will be calculated, should have predict_proba or predict method, or predict_function should be provided.
X : pd.DataFrame
Data used to calculate importance. If n is not None, n rows from X will be sampled.
n : int, optional
Number of samples to be used for calculation of importance. If None, all rows from X will be used. Default is None.
predict_function : Callable, optional
Function used to predict model output. It should take model and dataset and outputs predictions. If None, predict_proba method will be used if it exists, otherwise predict method. Default is None.
features : List[str], optional
List of features for which importance will be calculated. If None, all features from X will be used. Default is None.
show_progress : bool
If True, progress bar will be shown. Default is False.
batchsize : int
Batch size for calculating partial dependence. Data for prediction are collected until the number of rows exceeds batchsize. Then, the predict_function is called, jointly for the entire batch of observations. It speeds up the operation of the method by reducing the number of predict_function calls. Default is 2000.
pd_calculator : PartialDependenceCalculator, optional
PartialDependenceCalculator object containing partial dependence values for a given model and dataset. Providing this object speeds up the calculation as partial dependence values do not need to be recalculated. If None, it will be created from scratch. Default is None.

Returns

pd.DataFrame
Result dataframe containing feature importance with columns: "Feature", "Importance"
Expand source code
def importance(
    self,
    model,
    X: pd.DataFrame,
    n: int = None,
    predict_function: Optional[Callable] = None,
    features: Optional[List[str]] = None,
    show_progress: bool = False,
    batchsize: int = 2000,
    pd_calculator: Optional[PartialDependenceCalculator] = None,
):
    """Calculates Partial Dependence Based Feature Importance.

    Parameters
    ----------
    model : object
        Model to explain. It should expose `predict_proba` or `predict`, unless
        `predict_function` is supplied.
    X : pd.DataFrame
        Data used to calculate importance. When `n` is not None, `n` rows are sampled from it.
    n : int, optional
        Number of rows to sample for the calculation. If None, every row of X is used.
        Default is None.
    predict_function : Callable, optional
        Function producing model output; it takes the model and a dataset and returns predictions.
        If None, `predict_proba` is used when available, otherwise `predict`. Default is None.
    features : List[str], optional
        Features for which importance is calculated. If None, all columns of X are used.
        Default is None.
    show_progress : bool
        If True, a progress bar is shown. Default is False.
    batchsize : int
        Batch size for calculating partial dependence. Rows for prediction are gathered until their
        number exceeds `batchsize`; `predict_function` is then called once for the whole batch,
        which reduces the number of `predict_function` calls. Default is 2000.
    pd_calculator : PartialDependenceCalculator, optional
        Calculator already holding partial dependence values for the given model and dataset.
        Supplying it avoids recalculating those values. If None, a new one is created.
        Default is None.

    Returns
    -------
    pd.DataFrame
        Result dataframe containing feature importance with columns: "Feature", "Importance"

    Raises
    ------
    ValueError
        If the supplied `pd_calculator` holds a different model or different data
        than the ones used by this call.
    """
    self.predict_function = get_predict_function(model, predict_function)
    self.X_sampled = sample_if_not_none(self._random_generator, X, n)
    self.features_included = all_if_none(X.columns, features)

    if pd_calculator is not None:
        # A reused calculator is only valid for the very same model and sampled data.
        if pd_calculator.model != model:
            raise ValueError("Model in PDP calculator is different than the model in the method.")
        if not pd_calculator.X.equals(self.X_sampled):
            raise ValueError("Data in PDP calculator is different than the data in the method.")
        self.pd_calculator = pd_calculator
    else:
        self.pd_calculator = PartialDependenceCalculator(
            model, self.X_sampled, self.predict_function, batchsize
        )

    self.feature_importance = self._pdp_importance(show_progress)
    return self.feature_importance
class PermutationImportance (metric: artemis._utilities.performance_metrics.Metric = <artemis._utilities.performance_metrics.RMSE object>, random_state: Optional[int] = None)

Permutation-Based Feature Importance. It is used for calculating feature importance for performance based feature interaction - Sejong Oh method.

Importance of a feature is defined by the metric selected by the user (default is RMSE).

Attributes

method : str
Method name.
metric : Metric
Metric used for calculating performance.
feature_importance : pd.DataFrame
Feature importance values.

References

  • https://jmlr.org/papers/v20/18-760.html

Constructor for PermutationImportance.

Parameters

metric : Metric
Metric used to calculate model performance. Defaults to RMSE().
random_state : int, optional
Random state for reproducibility. Defaults to None.
Expand source code
class PermutationImportance(FeatureImportanceMethod):
    """
    Permutation-Based Feature Importance.
    It is used for calculating feature importance for performance based feature interaction - Sejong Oh method.

    Importance of a feature is defined by the metric selected by the user (default is RMSE).

    Attributes
    ----------
    method : str
        Method name.
    metric : Metric
        Metric used for calculating performance.
    feature_importance : pd.DataFrame
        Feature importance values.

    References
    ----------
    - https://jmlr.org/papers/v20/18-760.html
    """

    def __init__(self, metric: Metric = RMSE(), random_state: Optional[int] = None):
        """Constructor for PermutationImportance.

        Parameters
        ----------
        metric : Metric
            Metric used to calculate model performance. Defaults to RMSE().
        random_state : int, optional
            Random state for reproducibility. Defaults to None.
        """
        super().__init__(ImportanceMethod.PERMUTATION_IMPORTANCE, random_state=random_state)
        self.metric = metric

    def importance(
        self,
        model,
        X: pd.DataFrame,
        y_true: np.ndarray,
        n_repeat: int = 15,
        features: Optional[List[str]] = None,
        show_progress: bool = False,
    ) -> pd.DataFrame:
        """Calculates Permutation Based Feature Importance.

        Parameters
        ----------
        model : object
            Model for which importance will be calculated, should have predict method.
        X : pd.DataFrame
            Data used to calculate importance.
        y_true : np.ndarray or pd.Series
            Target values for X data.
        n_repeat : int, optional
            Number of permutations. Default is 15.
        features : List[str], optional
            List of features for which importance will be calculated. If None, all features from X will be used. Default is None.
        show_progress : bool
            If True, progress bar will be shown. Default is False.

        Returns
        -------
        pd.DataFrame
            Result dataframe containing feature importance with columns: "Feature", "Importance"
        """
        # The result is stored on the instance as well as returned.
        self.feature_importance = _permutation_importance(
            model, X, y_true, self.metric, n_repeat, features, show_progress, self._random_generator
        )
        return self.feature_importance

    @property
    def importance_ascending_order(self):
        """Always False. Presumably the `ascending` flag for ordering importance values (usage not visible in this class)."""
        return False

Ancestors

  • artemis.importance_methods._method.FeatureImportanceMethod

Instance variables

var importance_ascending_order
Expand source code
@property
def importance_ascending_order(self):
    """Always False. Presumably the `ascending` flag for ordering importance values (usage not visible here)."""
    return False

Methods

def importance(self, model, X: pandas.core.frame.DataFrame, y_true: numpy.ndarray, n_repeat: int = 15, features: Optional[List[str]] = None, show_progress: bool = False)

Calculates Permutation Based Feature Importance.

Parameters

model : object
Model for which importance will be calculated, should have predict method.
X : pd.DataFrame
Data used to calculate importance.
y_true : np.array or pd.Series
Target values for X data.
n_repeat : int, optional
Number of permutations. Default is 15.
features : List[str], optional
List of features for which importance will be calculated. If None, all features from X will be used. Default is None.
show_progress : bool
If True, progress bar will be shown. Default is False.

Returns

pd.DataFrame
Result dataframe containing feature importance with columns: "Feature", "Importance"
Expand source code
def importance(
    self,
    model,
    X: pd.DataFrame,
    y_true: np.ndarray,
    n_repeat: int = 15,
    features: Optional[List[str]] = None,
    show_progress: bool = False,
) -> pd.DataFrame:
    """Calculates Permutation Based Feature Importance.

    Parameters
    ----------
    model : object
        Model for which importance will be calculated, should have predict method.
    X : pd.DataFrame
        Data used to calculate importance.
    y_true : np.ndarray or pd.Series
        Target values for X data.
    n_repeat : int, optional
        Number of permutations. Default is 15.
    features : List[str], optional
        List of features for which importance will be calculated. If None, all features from X will be used. Default is None.
    show_progress : bool
        If True, progress bar will be shown. Default is False.

    Returns
    -------
    pd.DataFrame
        Result dataframe containing feature importance with columns: "Feature", "Importance"
    """
    # The result is stored on the instance as well as returned.
    self.feature_importance = _permutation_importance(
        model, X, y_true, self.metric, n_repeat, features, show_progress, self._random_generator
    )
    return self.feature_importance