Source code for elphick.mass_composition.utils.sklearn

import logging

import pandas as pd

try:
    from sklearn.pipeline import Pipeline
    from sklearn.base import BaseEstimator, RegressorMixin
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestRegressor

    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False


[docs]def extract_feature_names(pipeline): for name, step in pipeline.named_steps.items(): if hasattr(step, 'get_feature_names_out'): # This step has a get_feature_names_out method, so we use it return step.get_feature_names_out() elif hasattr(step, 'get_feature_names'): # This step has a get_feature_names method return step.get_feature_names() elif hasattr(step, 'get_params'): # This step doesn't have a method to get feature names directly, but it might have transformer(s) that do params = step.get_params() for param_name, param_value in params.items(): if hasattr(param_value, 'get_feature_names_out'): return param_value.get_feature_names_out() elif hasattr(param_value, 'get_feature_names'): return param_value.get_feature_names() return []
if SKLEARN_AVAILABLE: class PandasPipeline(Pipeline, RegressorMixin): def __init__(self, steps, memory=None, verbose=False): super().__init__(steps, memory=memory, verbose=verbose) self._logger = logging.getLogger(__class__.__name__) self.feature_names_in__ = None self.feature_names_out_ = None self.set_output(transform='pandas')
[docs] def fit(self, X, y=None, **fit_params): if not isinstance(X, pd.DataFrame) or not isinstance(y, pd.DataFrame): raise ValueError("Input X and y must be pandas DataFrame") self.feature_names_in__ = X.columns.to_list() self.feature_names_out_ = y.columns.tolist() super().fit(X, y, **fit_params) return self
[docs] def transform(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame: if not isinstance(X, pd.DataFrame): raise ValueError("Input X must be pandas DataFrame") # ignore any features that the model was not fitted on and log if self.feature_names_in__ is not None and any([col not in self.feature_names_in__ for col in X.columns]): missing_features = [col for col in X.columns if col not in self.feature_names_in__] self._logger.info(f"Features {missing_features} were passed but are ignored since they" f" are not required by the model") X = X.copy().drop(columns=missing_features) Xt: pd.DataFrame = Pipeline.transform(self, X) return Xt
[docs] def predict(self, X: pd.DataFrame) -> pd.DataFrame: if not isinstance(X, pd.DataFrame): raise ValueError("Input X must be pandas DataFrame") # ignore any features that the model was not fitted on and log if self.feature_names_in__ is not None and any([col not in self.feature_names_in__ for col in X.columns]): missing_features = [col for col in X.columns if col not in self.feature_names_in__] self._logger.info(f"Features {missing_features} were passed but are ignored since they" f" are not required by the model") X = X.copy().drop(columns=missing_features) predictions = super().predict(X) return pd.DataFrame(predictions, columns=self.feature_names_out_, index=X.index)
[docs] def score(self, X: pd.DataFrame, y: pd.DataFrame) -> float: # ignore any features that the model was not fitted on and log if self.feature_names_in__ is not None and any([col not in self.feature_names_in__ for col in X.columns]): missing_features = [col for col in X.columns if col not in self.feature_names_in__] self._logger.info(f"Features {missing_features} were passed but are ignored since they" f" are not required by the model") X = X.copy().drop(columns=missing_features) # Call the parent class's score method return super().score(X, y)
[docs] def get_feature_names_out(self): return self.feature_names_out_
@classmethod def from_pipeline(cls, pipeline): return PandasPipeline(pipeline.steps) else:
[docs] class PandasPipeline:
[docs] def __init__(self, *args, **kwargs): raise ImportError("sklearn is not installed but is required for PandasPipeline. Please install it.")