Source code for elphick.sklearn_viz.features.importance

import logging
from typing import Union, Optional, Dict, Callable

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.base import is_classifier

from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from elphick.sklearn_viz.model_selection.scorers import r2_score_with_nan, classification_scorers, regression_scorers
from elphick.sklearn_viz.utils import log_timer


[docs]def plot_feature_importance(mdl,
                            sort: bool = False,
                            top_k: Optional[int] = None,
                            horizontal: bool = False,
                            permute: bool = False,
                            pipeline_input_features: bool = False,
                            x_test: Optional[pd.DataFrame] = None,
                            y_test: Optional[Union[pd.DataFrame, pd.Series]] = None,
                            title: Optional[str] = None
                            ) -> go.Figure:
    """

    Args:
        mdl: The scikit-learn model or pipeline.
        sort: If True, sort by decreasing importance
        top_k: Include only the top k features in the plot.  Will ignore the sort argument.
        horizontal: If True plot horizontal bars, if False vertical bars.
        permute: If True plot permutation importance.  Better, but slower.  Requires X_test and y_test to be provided.
        pipeline_input_features: If True, and a pipeline is provided, report the features provided as inputs to
         the pipeline.  If False, reports the estimator (last pipeline step) input features.  Requires permute = True.
        x_test: X values provided to execute permuted importance.
        y_test: y values provided to execute permuted importance.
        title: title for the plot

    Returns:
        a plotly GraphObjects.Figure

    """

    return FeatureImportance(mdl=mdl, permute=permute, pipeline_input_features=pipeline_input_features,
                             x_test=x_test, y_test=y_test).plot(sort=sort, top_k=top_k, horizontal=horizontal,
                                                                title=title)


[docs]class FeatureImportance:
[docs]    def __init__(self,
                 mdl,
                 permute: bool = False,
                 pipeline_input_features: bool = False,
                 x_test: Optional[pd.DataFrame] = None,
                 y_test: Optional[Union[pd.DataFrame, pd.Series]] = None,
                 scorer: Optional[Union[str, Callable]] = None,):
        """

        Args:
            mdl: The scikit-learn model or pipeline.
            permute: If True plot permutation importance.  Better, but slower.  Requires X_test and y_test to be provided.
            pipeline_input_features: If True, and a pipeline is provided, report the features provided as inputs to
             the pipeline.  If False, reports the estimator (last pipeline step) input features.  Requires permute = True.
            x_test: X values provided to execute permuted importance.
            y_test: y values provided to execute permuted importance.
            scorer: Optional callable scorer which the model will be fitted using

        """
        self._logger = logging.getLogger(name=__class__.__name__)
        self.mdl = mdl
        self.permute: bool = permute
        self.pipeline_input_features: bool = pipeline_input_features
        self.X_test: Optional[pd.DataFrame] = x_test
        self.y_test: Optional[Union[pd.DataFrame, pd.Series]] = y_test

        if scorer is not None:
            self.scorer = scorer
        else:
            self.scorer = classification_scorers[list(classification_scorers.keys())[0]] if is_classifier(self.mdl) else \
                regression_scorers[list(regression_scorers.keys())[0]]

        self._data: Optional[pd.DataFrame] = None
        self.is_pipeline: bool = isinstance(mdl, Pipeline)

        if not self.permute:
            check_is_fitted(mdl[-1]) if self.is_pipeline else check_is_fitted(mdl)

    @property
    @log_timer
    def data(self) -> Optional[pd.DataFrame]:
        if self._data is not None:
            res = self._data
        else:
            mdl = self.mdl

            if self.permute:
                self._logger.info("Generating feature importance by permutation")
                x = self.X_test
                if self.is_pipeline and not self.pipeline_input_features:
                    mdl = mdl[-1]
                    x = self.mdl[0:-1].transform(self.X_test)

                result = permutation_importance(estimator=mdl, X=x, y=self.y_test, n_repeats=10, random_state=42,
                                                n_jobs=2, scoring=self.scorer)
                importances = result.importances_mean
                std = result.importances_std
            else:
                self._logger.info("Extracting feature importance from the fitted model")
                if self.is_pipeline:
                    mdl = mdl[-1]
                try:  # trees
                    importances = mdl.feature_importances_
                    std = np.std([tree.feature_importances_ for tree in mdl.estimators_], axis=0)
                except AttributeError:  # regression
                    importances = mdl.coef_
                    std = np.full(len(importances), np.nan)

            try:
                feature_names = mdl.feature_names_in_
            except AttributeError:
                self._logger.warning("Feature names are not available within the model."
                                     " Setting the transform output to pandas will correct this."
                                     " e.g. pipe.set_output(transform='pandas')."
                                     " Retrying with the pre-processed feature names.")
                try:
                    # Likely non-sklearn estimator like CatBoostRegressor
                    feature_names = list(self.mdl[0:-1].transform(self.X_test).columns)
                except AttributeError:
                    self._logger.warning("Feature names are not available within the model."
                                         " Setting default names")
                    feature_names = [f"F{i}" for i in range(1, mdl.n_features_in_ + 1)]

            res: pd.DataFrame = pd.DataFrame([importances, std], index=['importance', 'std'], columns=feature_names).T
            self._data = res

        return res

[docs]    def plot(self,
             sort: bool = False,
             top_k: Optional[int] = None,
             horizontal: bool = False,
             title: Optional[str] = None) -> go.Figure:
        """

        Args:
            sort: If True, sort by decreasing importance
            top_k: Include only the top k features in the plot.  Will ignore the sort argument.
            horizontal: If True plot horizontal bars, if False vertical bars.
             the pipeline.  If False, reports the estimator (last pipeline step) input features.  Requires permute = True.
            title: title for the plot

        Returns:
            a plotly GraphObjects.Figure

        """
        data = self.data
        subtitle: str = 'Feature Importance'
        if self.permute:
            subtitle = 'Permuted ' + subtitle

        if sort or top_k is not None:
            data = data.sort_values(by=['importance'], ascending=False)
            if top_k is not None:
                data = data.iloc[0:top_k, :]
            if horizontal:
                data = data.sort_values(by=['importance'], ascending=True)

        if horizontal:
            kwargs: Dict = {'y': data.index,
                            'x': data['importance'],
                            'error_x': dict(type='data', array=data['std']),
                            'orientation': 'h'}
        else:
            kwargs: Dict = {'x': data.index,
                            'y': data['importance'],
                            'error_y': dict(type='data', array=data['std'])}

        if title is None:
            title = subtitle
        else:
            title = title + '<br>' + subtitle

        fig = go.Figure()
        fig.add_trace(go.Bar(name='Model 1', **kwargs))
        fig.update_layout(title=title, barmode='group')

        return fig