Module artemis.importance_methods.model_agnostic
Expand source code
from artemis.importance_methods.model_agnostic._pdp import PartialDependenceBasedImportance
from artemis.importance_methods.model_agnostic._permutational_importance import PermutationImportance
# Names exported as the public API of this package (what `from ... import *` brings in).
__all__ = ["PermutationImportance", "PartialDependenceBasedImportance"]
Classes
class PartialDependenceBasedImportance
-
Partial Dependence Based Feature Importance. It is used for calculating feature importance for partial dependence based feature interaction methods: Friedman's H-statistic and Greenwell methods.
Attributes
method
:str
- Method name.
feature_importance
:pd.DataFrame
- Feature importance values.
features_included
:List[str]
- List of features for which importance is calculated.
X_sampled
:pd.DataFrame
- Sampled data used for calculation.
pd_calculator
:PartialDependenceCalculator
- Object used to calculate and store partial dependence values.
References
- https://arxiv.org/abs/1805.04755
Constructor for PartialDependenceBasedImportance
Expand source code
class PartialDependenceBasedImportance(FeatureImportanceMethod):
    """
    Partial Dependence Based Feature Importance.
    It is used for calculating feature importance for partial dependence based feature interaction methods:
    Friedman's H-statistic and Greenwell methods.

    Attributes
    ----------
    method : str
        Method name.
    feature_importance : pd.DataFrame
        Feature importance values.
    features_included : List[str]
        List of features for which importance is calculated.
    X_sampled : pd.DataFrame
        Sampled data used for calculation.
    pd_calculator : PartialDependenceCalculator
        Object used to calculate and store partial dependence values.

    References
    ----------
    - https://arxiv.org/abs/1805.04755
    """

    def __init__(self):
        """Constructor for PartialDependenceBasedImportance"""
        super().__init__(ImportanceMethod.PDP_BASED_IMPORTANCE)

    def importance(
        self,
        model,
        X: pd.DataFrame,
        n: Optional[int] = None,
        predict_function: Optional[Callable] = None,
        features: Optional[List[str]] = None,
        show_progress: bool = False,
        batchsize: int = 2000,
        pd_calculator: Optional[PartialDependenceCalculator] = None,
    ):
        """Calculates Partial Dependence Based Feature Importance.

        Parameters
        ----------
        model : object
            Model for which importance will be calculated, should have predict_proba or predict method,
            or predict_function should be provided.
        X : pd.DataFrame
            Data used to calculate importance. If n is not None, n rows from X will be sampled.
        n : int, optional
            Number of samples to be used for calculation of importance. If None, all rows from X will be used.
            Default is None.
        predict_function : Callable, optional
            Function used to predict model output. It should take model and dataset and outputs predictions.
            If None, `predict_proba` method will be used if it exists, otherwise `predict` method.
            Default is None.
        features : List[str], optional
            List of features for which importance will be calculated. If None, all features from X will be used.
            Default is None.
        show_progress : bool
            If True, progress bar will be shown. Default is False.
        batchsize : int
            Batch size for calculating partial dependence. Data for prediction are collected until the number
            of rows exceeds batchsize. Then, the `predict_function` is called, jointly for the entire batch
            of observations. It speeds up the operation of the method by reducing the number of
            `predict_function` calls. Default is 2000.
        pd_calculator : PartialDependenceCalculator, optional
            PartialDependenceCalculator object containing partial dependence values for a given model and dataset.
            Providing this object speeds up the calculation as partial dependence values do not need
            to be recalculated. If None, it will be created from scratch. Default is None.

        Returns
        -------
        pd.DataFrame
            Result dataframe containing feature importance with columns: "Feature", "Importance"

        Raises
        ------
        ValueError
            If `pd_calculator` is provided but its model differs from `model`, or its data
            is not equal to the (sampled) data used by this method.
        """
        self.predict_function = get_predict_function(model, predict_function)
        self.X_sampled = sample_if_not_none(self._random_generator, X, n)
        self.features_included = all_if_none(X.columns, features)

        if pd_calculator is None:
            self.pd_calculator = PartialDependenceCalculator(
                model, self.X_sampled, self.predict_function, batchsize
            )
        else:
            # A reused calculator is only valid for the very same model and data.
            if pd_calculator.model != model:
                raise ValueError("Model in PDP calculator is different than the model in the method.")
            if not pd_calculator.X.equals(self.X_sampled):
                raise ValueError("Data in PDP calculator is different than the data in the method.")
            self.pd_calculator = pd_calculator

        self.feature_importance = self._pdp_importance(show_progress)
        return self.feature_importance

    @property
    def importance_ascending_order(self):
        """Value passed as ``ascending`` when sorting the importance table; always False."""
        return False

    def _pdp_importance(self, show_progress: bool) -> pd.DataFrame:
        """Build the sorted "Feature"/"Importance" table from single-feature partial dependence values."""
        self.pd_calculator.calculate_pd_single(show_progress=show_progress)
        num_features, _ = split_features_num_cat(self.X_sampled, self.features_included)
        importance = [
            _calc_importance(feature, self.pd_calculator.get_pd_single(feature), feature in num_features)
            for feature in self.features_included
        ]
        # Rows are sorted first; missing importance values are replaced with 0 afterwards.
        return pd.DataFrame(importance, columns=["Feature", "Importance"]).sort_values(
            by="Importance", ascending=self.importance_ascending_order, ignore_index=True
        ).fillna(0)
Ancestors
- artemis.importance_methods._method.FeatureImportanceMethod
Instance variables
var importance_ascending_order
-
Expand source code
@property
def importance_ascending_order(self):
    """Flag telling whether importance values are ordered ascending; always False."""
    ascending = False
    return ascending
Methods
def importance(self, model, X: pandas.core.frame.DataFrame, n: int = None, predict_function: Optional[Callable] = None, features: Optional[List[str]] = None, show_progress: bool = False, batchsize: int = 2000, pd_calculator: Optional[artemis._utilities.pd_calculator.PartialDependenceCalculator] = None)
-
Calculates Partial Dependence Based Feature Importance.
Parameters
model
:object
- Model for which importance will be calculated, should have predict_proba or predict method, or predict_function should be provided.
X
:pd.DataFrame
- Data used to calculate importance. If n is not None, n rows from X will be sampled.
n
:int
, optional- Number of samples to be used for calculation of importance. If None, all rows from X will be used. Default is None.
predict_function
:Callable
, optional- Function used to predict model output. It should take model and dataset and outputs predictions.
If None,
predict_proba
method will be used if it exists, otherwise
predict
method. Default is None.
features
:List[str]
, optional- List of features for which importance will be calculated. If None, all features from X will be used. Default is None.
show_progress
:bool
- If True, progress bar will be shown. Default is False.
batchsize
:int
- Batch size for calculating partial dependence. Data for prediction are collected until the number of rows exceeds batchsize.
Then, the
predict_function
is called, jointly for the entire batch of observations. It speeds up the operation of the method by reducing the number of
predict_function
calls. Default is 2000.
pd_calculator
:PartialDependenceCalculator
, optional- PartialDependenceCalculator object containing partial dependence values for a given model and dataset. Providing this object speeds up the calculation as partial dependence values do not need to be recalculated. If None, it will be created from scratch. Default is None.
Returns
pd.DataFrame
- Result dataframe containing feature importance with columns: "Feature", "Importance"
Expand source code
def importance(
    self,
    model,
    X: pd.DataFrame,
    n: Optional[int] = None,
    predict_function: Optional[Callable] = None,
    features: Optional[List[str]] = None,
    show_progress: bool = False,
    batchsize: int = 2000,
    pd_calculator: Optional[PartialDependenceCalculator] = None,
):
    """Calculates Partial Dependence Based Feature Importance.

    Parameters
    ----------
    model : object
        Model for which importance will be calculated, should have predict_proba or predict method,
        or predict_function should be provided.
    X : pd.DataFrame
        Data used to calculate importance. If n is not None, n rows from X will be sampled.
    n : int, optional
        Number of samples to be used for calculation of importance. If None, all rows from X will be used.
        Default is None.
    predict_function : Callable, optional
        Function used to predict model output. It should take model and dataset and outputs predictions.
        If None, `predict_proba` method will be used if it exists, otherwise `predict` method.
        Default is None.
    features : List[str], optional
        List of features for which importance will be calculated. If None, all features from X will be used.
        Default is None.
    show_progress : bool
        If True, progress bar will be shown. Default is False.
    batchsize : int
        Batch size for calculating partial dependence. Data for prediction are collected until the number
        of rows exceeds batchsize. Then, the `predict_function` is called, jointly for the entire batch
        of observations. It speeds up the operation of the method by reducing the number of
        `predict_function` calls. Default is 2000.
    pd_calculator : PartialDependenceCalculator, optional
        PartialDependenceCalculator object containing partial dependence values for a given model and dataset.
        Providing this object speeds up the calculation as partial dependence values do not need
        to be recalculated. If None, it will be created from scratch. Default is None.

    Returns
    -------
    pd.DataFrame
        Result dataframe containing feature importance with columns: "Feature", "Importance"

    Raises
    ------
    ValueError
        If `pd_calculator` is provided but its model differs from `model`, or its data
        is not equal to the (sampled) data used by this method.
    """
    self.predict_function = get_predict_function(model, predict_function)
    self.X_sampled = sample_if_not_none(self._random_generator, X, n)
    self.features_included = all_if_none(X.columns, features)

    if pd_calculator is None:
        self.pd_calculator = PartialDependenceCalculator(
            model, self.X_sampled, self.predict_function, batchsize
        )
    else:
        # A reused calculator is only valid for the very same model and data.
        if pd_calculator.model != model:
            raise ValueError("Model in PDP calculator is different than the model in the method.")
        if not pd_calculator.X.equals(self.X_sampled):
            raise ValueError("Data in PDP calculator is different than the data in the method.")
        self.pd_calculator = pd_calculator

    self.feature_importance = self._pdp_importance(show_progress)
    return self.feature_importance
class PermutationImportance (metric: artemis._utilities.performance_metrics.Metric = <artemis._utilities.performance_metrics.RMSE object>, random_state: Optional[int] = None)
-
Permutation-Based Feature Importance. It is used for calculating feature importance for performance based feature interaction - Sejong Oh method.
Importance of a feature is defined by the metric selected by user (default is RMSE).
Attributes
method
:str
- Method name.
metric
:Metric
- Metric used for calculating performance.
feature_importance
:pd.DataFrame
- Feature importance values.
References
- https://jmlr.org/papers/v20/18-760.html
Constructor for PermutationImportance.
Parameters
metric
:Metric
- Metric used to calculate model performance. Defaults to RMSE().
random_state
:int
, optional- Random state for reproducibility. Defaults to None.
Expand source code
class PermutationImportance(FeatureImportanceMethod):
    """
    Permutation-Based Feature Importance.
    It is used for calculating feature importance for performance based feature interaction - Sejong Oh method.

    Importance of a feature is defined by the metric selected by user (default is RMSE).

    Attributes
    ----------
    method : str
        Method name.
    metric : Metric
        Metric used for calculating performance.
    feature_importance : pd.DataFrame
        Feature importance values.

    References
    ----------
    - https://jmlr.org/papers/v20/18-760.html
    """

    # NOTE: the default RMSE() object is created once, when the function is defined,
    # and is shared by every instance constructed without an explicit metric.
    def __init__(self, metric: Metric = RMSE(), random_state: Optional[int] = None):
        """Constructor for PermutationImportance.

        Parameters
        ----------
        metric : Metric
            Metric used to calculate model performance. Defaults to RMSE().
        random_state : int, optional
            Random state for reproducibility. Defaults to None.
        """
        super().__init__(ImportanceMethod.PERMUTATION_IMPORTANCE, random_state=random_state)
        self.metric = metric

    def importance(
        self,
        model,
        X: pd.DataFrame,
        y_true: np.ndarray,
        n_repeat: int = 15,
        features: Optional[List[str]] = None,
        show_progress: bool = False,
    ):
        """Calculates Permutation Based Feature Importance.

        Parameters
        ----------
        model : object
            Model for which importance will be calculated, should have predict method.
        X : pd.DataFrame
            Data used to calculate importance.
        y_true : np.ndarray or pd.Series
            Target values for X data.
        n_repeat : int, optional
            Number of permutations. Default is 15.
        features : List[str], optional
            List of features for which importance will be calculated. If None, all features from X will be used.
            Default is None.
        show_progress : bool
            If True, progress bar will be shown. Default is False.

        Returns
        -------
        pd.DataFrame
            Result dataframe containing feature importance with columns: "Feature", "Importance"
        """
        self.feature_importance = _permutation_importance(
            model, X, y_true, self.metric, n_repeat, features, show_progress, self._random_generator
        )
        return self.feature_importance

    @property
    def importance_ascending_order(self):
        """Flag telling whether importance values are ordered ascending; always False."""
        return False
Ancestors
- artemis.importance_methods._method.FeatureImportanceMethod
Instance variables
var importance_ascending_order
-
Expand source code
@property
def importance_ascending_order(self):
    """Flag telling whether importance values are ordered ascending; always False."""
    ascending = False
    return ascending
Methods
def importance(self, model, X: pandas.core.frame.DataFrame, y_true: np.array, n_repeat: int = 15, features: Optional[List[str]] = None, show_progress: bool = False)
-
Calculates Permutation Based Feature Importance.
Parameters
model
:object
- Model for which importance will be calculated, should have predict method.
X
:pd.DataFrame
- Data used to calculate importance.
y_true
:np.array
or pd.Series
- Target values for X data.
n_repeat
:int
, optional- Number of permutations. Default is 15.
features
:List[str]
, optional- List of features for which importance will be calculated. If None, all features from X will be used. Default is None.
show_progress
:bool
- If True, progress bar will be shown. Default is False.
Returns
pd.DataFrame
- Result dataframe containing feature importance with columns: "Feature", "Importance"
Expand source code
def importance(
    self,
    model,
    X: pd.DataFrame,
    y_true: np.ndarray,
    n_repeat: int = 15,
    features: Optional[List[str]] = None,
    show_progress: bool = False,
):
    """Calculates Permutation Based Feature Importance.

    Parameters
    ----------
    model : object
        Model for which importance will be calculated, should have predict method.
    X : pd.DataFrame
        Data used to calculate importance.
    y_true : np.ndarray or pd.Series
        Target values for X data.
    n_repeat : int, optional
        Number of permutations. Default is 15.
    features : List[str], optional
        List of features for which importance will be calculated. If None, all features from X will be used.
        Default is None.
    show_progress : bool
        If True, progress bar will be shown. Default is False.

    Returns
    -------
    pd.DataFrame
        Result dataframe containing feature importance with columns: "Feature", "Importance"
    """
    self.feature_importance = _permutation_importance(
        model, X, y_true, self.metric, n_repeat, features, show_progress, self._random_generator
    )
    return self.feature_importance