Source code for elphick.sklearn_viz.features.principal_components

"""
Developed from the example here: https://plotly.com/python/pca-visualization/
"""
import dataclasses
import logging
from typing import Optional, List, Dict

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from pandas.core.dtypes.common import is_numeric_dtype
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from elphick.sklearn_viz.utils import log_timer


def plot_principal_components(x: pd.DataFrame,
                              color: Optional[pd.Series] = None,
                              plot_3d: bool = True,
                              loading_vectors: bool = True,
                              standardised: bool = False,
                              title: Optional[str] = None) -> go.Figure:
    """Scatter plot of the leading principal components of ``x``.

    Convenience wrapper: builds a ``PrincipalComponents`` object from ``x`` and ``color`` and
    delegates to its ``plot_principal_components`` method.

    Args:
        x: X values to transform and plot.
        color: optional series by which to color the markers
        plot_3d: If True plot the top 3 principal components in 3D, otherwise the top 2 in 2D.
        loading_vectors: If True the loading vectors will be displayed.
        standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
            unit variance.
        title: Optional plot title

    Returns:
        a plotly GraphObjects.Figure
    """
    pca_plotter = PrincipalComponents(x=x, color=color)
    fig: go.Figure = pca_plotter.plot_principal_components(plot_3d=plot_3d,
                                                           loading_vectors=loading_vectors,
                                                           standardised=standardised,
                                                           title=title)
    return fig
def plot_explained_variance(x: pd.DataFrame,
                            y: Optional[pd.Series] = None,
                            title: Optional[str] = None,
                            standardised: bool = False) -> go.Figure:
    """Plot the cumulative explained variance by principal component.

    Convenience wrapper around ``PrincipalComponents.plot_explained_variance``.

    Args:
        x: X values to transform and plot.
        y: optional target vector
        title: Optional plot title
        standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
            unit variance.  Defaults to False, which matches the behaviour before this option existed.

    Returns:
        a plotly GraphObjects.Figure
    """
    return PrincipalComponents(x=x, color=y).plot_explained_variance(standardised=standardised, title=title)
def plot_scatter_matrix(x: pd.DataFrame,
                        y: Optional[pd.Series] = None,
                        original_features: bool = False,
                        title: Optional[str] = None,
                        standardised: bool = False) -> go.Figure:
    """Plot a scatter matrix of either the original features or the principal components.

    Convenience wrapper around ``PrincipalComponents.plot_scatter_matrix``.

    Args:
        x: X values to transform and plot.
        y: optional series by which to color the markers
        original_features: If True, plot the original features, otherwise plot the principal components.
        title: Optional plot title
        standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
            unit variance.  Defaults to False, which matches the behaviour before this option existed.

    Returns:
        a plotly GraphObjects.Figure
    """
    return PrincipalComponents(x=x, color=y).plot_scatter_matrix(original_features=original_features,
                                                                 standardised=standardised,
                                                                 title=title)
def plot_loading_vectors(x: pd.DataFrame,
                         color: Optional[pd.Series] = None,
                         standardised: bool = False,
                         title: Optional[str] = None) -> go.Figure:
    """Plot the loading vectors of the top two principal components.

    Convenience wrapper around ``PrincipalComponents.plot_loading_vectors``.

    Args:
        x: X values to transform and plot.
        color: optional series used to group the loading vectors.  Grouping is only applied when the
            series is non-numeric (categorical); a numeric series yields the ungrouped plot.
        standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
            unit variance.
        title: Optional plot title

    Returns:
        a plotly GraphObjects.Figure
    """
    # Per-group PCA results are only computed for non-numeric color series, so only request the
    # grouped plot in that case - otherwise the lookup of the group results fails.
    by_color: bool = color is not None and not is_numeric_dtype(color)
    return PrincipalComponents(x=x, color=color).plot_loading_vectors(standardised=standardised,
                                                                      title=title,
                                                                      by_color=by_color)
def plot_correlation_circle(x: pd.DataFrame,
                            color: Optional[pd.Series] = None,
                            title: Optional[str] = None) -> go.Figure:
    """Plot the correlation circle: the loading vectors of the standardised PCA inside a unit circle.

    Convenience wrapper that calls ``PrincipalComponents.plot_loading_vectors`` with
    ``standardised=True``.

    Args:
        x: X values to transform and plot.
        color: optional series used to group the loading vectors.  Grouping is only applied when the
            series is non-numeric (categorical); a numeric series yields the ungrouped plot.
        title: Optional plot title

    Returns:
        a plotly GraphObjects.Figure
    """
    # Per-group PCA results are only computed for non-numeric color series, so only request the
    # grouped plot in that case - otherwise the lookup of the group results fails.
    by_color: bool = color is not None and not is_numeric_dtype(color)
    return PrincipalComponents(x=x, color=color).plot_loading_vectors(standardised=True,
                                                                      title=title,
                                                                      by_color=by_color)
@dataclasses.dataclass
class PCResults:
    """Container for the outputs of a single Principal Component Analysis fit.

    Attributes:
        data: the transformed observations, one column per principal component.
        explained_variance: the explained variance of each principal component.
        loadings: the loadings, indexed by feature with one column per principal component.
    """
    data: pd.DataFrame
    explained_variance: pd.Series
    loadings: pd.DataFrame
class PrincipalComponents:
    """Principal Component Analysis of a feature matrix, with plotly visualisations.

    The PCA is fitted lazily on first access of the ``data`` property, both on the raw features and on
    the standardised features, and the results are cached for subsequent plots.
    """

    def __init__(self, x: pd.DataFrame, color: Optional[pd.Series] = None):
        """

        Args:
            x: X values to transform and plot.
            color: the optional series by which to color the markers
        """
        self._logger = logging.getLogger(name=__class__.__name__)
        self.x: pd.DataFrame = x
        self.color: Optional[pd.Series] = color
        self._data: Optional[Dict] = None

    @property
    @log_timer
    def data(self) -> Optional[Dict]:
        """The PCA results, computed on first access and cached thereafter.

        Returns:
            a dict with the keys 'raw' and 'std', each holding a PCResults for the raw and the standardised
            features respectively.  When ``color`` is a non-numeric series the dict also has a 'group' key
            that maps each group value to its own ``{'raw': PCResults, 'std': PCResults}`` dict, fitted on
            the rows of ``x`` belonging to that group.
        """
        if self._data is not None:
            return self._data

        def get_pca_results(pipe, x: pd.DataFrame) -> PCResults:
            xt: pd.DataFrame = pipe.fit_transform(x)
            # PCA keeps at most min(n_samples, n_features) components, so the column names are sized
            # from the transformed output rather than from the number of input features.
            xt.columns = [f"PC{i + 1}" for i in range(xt.shape[1])]
            pca_step = pipe['pca']
            var: pd.Series = pd.Series(data=pca_step.explained_variance_ratio_ * 100.,
                                       name='explained_variance')
            loadings = pd.DataFrame(data=pca_step.components_.T * np.sqrt(pca_step.explained_variance_),
                                    index=x.columns, columns=xt.columns)
            return PCResults(data=xt, explained_variance=var, loadings=loadings)

        res: Dict = {}
        self._logger.info("Commencing PCA")
        pca = make_pipeline(PCA()).set_output(transform="pandas")
        pca_std = make_pipeline(StandardScaler(), PCA()).set_output(transform="pandas")

        # group-wise results only make sense for a categorical (non-numeric) color series
        by_group: bool = self.color is not None and not is_numeric_dtype(self.color)

        for label, pipe in {'raw': pca, 'std': pca_std}.items():
            res[label] = get_pca_results(pipe=pipe, x=self.x)
            if by_group:
                groups: Dict = res.setdefault('group', {})
                # missing labels never compare equal to any row, so they are excluded rather than
                # fitted on an empty subset
                for grp in self.color.dropna().unique():
                    groups.setdefault(grp, {})[label] = get_pca_results(pipe=pipe,
                                                                        x=self.x.loc[self.color == grp, :])
        self._data = res
        return res

    def plot_principal_components(self,
                                  plot_3d: bool = False,
                                  loading_vectors: bool = True,
                                  standardised: bool = False,
                                  title: Optional[str] = None) -> go.Figure:
        """Create the pca plot

        Args:
            plot_3d: If True plot the top 3 principal components in 3D, otherwise the top 2 in 2D.
            loading_vectors: If True the loading vectors will be displayed.
            standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
                unit variance.
            title: Optional plot title

        In 3D the loading vectors are implemented manually rather than with annotations (lines with
        arrows), the problem is described well here:
        https://community.plotly.com/t/set-pca-loadings-aka-arrows-in-a-3d-scatter-plot/72905

        Returns:
            a plotly GraphObjects.Figure

        Raises:
            ValueError: if fewer principal components are available than the plot requires.
        """
        label: str = 'std' if standardised else 'raw'
        results: PCResults = self.data[label]
        pca_data: pd.DataFrame = results.data
        pca_loadings: pd.DataFrame = results.loadings
        pca_variance: pd.Series = results.explained_variance

        num_components: int = 3 if plot_3d else 2
        if pca_data.shape[1] < num_components:
            raise ValueError(f"A {num_components}D plot needs at least {num_components} principal components, "
                             f"but only {pca_data.shape[1]} are available.")

        df_plot: pd.DataFrame = pd.concat([pca_data, self.x], axis=1).reset_index()
        hover_data: List = list(self.x.reset_index().columns)

        if title is None:
            title = (f"Top {num_components} Principal Components<br>Explained Variance = "
                     f"{round(pca_variance.iloc[0:num_components].sum(), 1)}%")

        if plot_3d:
            fig = px.scatter_3d(df_plot, x='PC1', y='PC2', z='PC3', color=self.color, hover_data=hover_data)
            fig.update_traces(marker_size=4)
            if loading_vectors:
                self._add_loading_vectors_3d(fig, pca_loadings)
        else:  # 2D
            fig = px.scatter(df_plot, x='PC1', y='PC2', color=self.color, hover_data=hover_data)
            fig.update_traces(marker_size=5)
            if loading_vectors:
                self.add_loading_vectors(fig, pca_loadings.iloc[:, 0:2])
            # the axis percentages come from the same (raw or standardised) fit that is plotted
            fig.update_layout(xaxis_title=f"PC1 ({round(pca_variance.iloc[0], 1)}%)",
                              yaxis_title=f"PC2 ({round(pca_variance.iloc[1], 1)}%)")

        fig.update_layout(title=title)

        if self.color is not None:
            fig.update_layout(legend_title_text=self.color.name,
                              coloraxis_colorbar_title_text=self.color.name)

        return fig

    @staticmethod
    def _add_loading_vectors_3d(fig: go.Figure, loadings: pd.DataFrame) -> go.Figure:
        """Draw each feature's loading vector on a 3D figure as a line from the origin with an end marker."""
        annots: List = [dict(x=row.PC1, y=row.PC2, z=row.PC3, text=feature_name, showarrow=False,
                             xanchor="left", xshift=10, yshift=10, opacity=0.7)
                        for feature_name, row in loadings.iterrows()]
        fig.update_layout(scene=dict(annotations=annots))
        for feature_name, row in loadings.iterrows():
            # the end-point marker carries the legend entry for the feature
            # noinspection PyTypeChecker
            fig.add_trace(
                go.Scatter3d(x=(row.PC1,), y=(row.PC2,), z=(row.PC3,),
                             mode='markers',
                             marker={'size': 6, 'line': dict(width=2, color='black')},
                             name=feature_name,
                             showlegend=True,
                             legendgroup="features",
                             legendgrouptitle_text="feature vectors",
                             ))
            # the vector itself, from the origin to the loading
            fig.add_trace(
                go.Scatter3d(x=(0, row.PC1), y=(0, row.PC2), z=(0, row.PC3),
                             mode='lines',
                             line={'width': 5, 'color': 'black'},
                             name=feature_name,
                             showlegend=False))
        fig.update_layout(legend=dict(groupclick="toggleitem"))
        return fig

    def plot_explained_variance(self, standardised: bool = False, title: Optional[str] = None) -> go.Figure:
        """Plot the cumulative explained variance by principal component.

        Args:
            standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
                unit variance.
            title: Optional plot title

        Returns:
            a plotly GraphObjects.Figure
        """
        label: str = 'std' if standardised else 'raw'
        pca_variance: pd.Series = self.data[label].explained_variance
        exp_var_cumul = np.cumsum(pca_variance)

        fig = px.area(
            x=range(1, exp_var_cumul.shape[0] + 1),
            y=exp_var_cumul,
            labels={"x": "# Components", "y": "Explained Variance"}
        )
        fig.update_layout(title='Cumulative Explained Variance by Principal Component' if title is None else title)
        fig.update_xaxes(type='category')
        return fig

    def plot_scatter_matrix(self,
                            original_features: bool = False,
                            standardised: bool = False,
                            title: Optional[str] = None) -> go.Figure:
        """Plot a scatter matrix

        Args:
            original_features: If True, plot the original features, otherwise plot the principal components.
            standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
                unit variance.
            title: Optional plot title

        Returns:
            a plotly GraphObjects.Figure
        """
        label: str = 'std' if standardised else 'raw'
        y: Optional[pd.Series] = self.color

        if original_features:
            x: pd.DataFrame = self.x
            default_title: str = 'Scatter Matrix - Original Feature Space'
            frames: List = [x, y]
            hover_data: List = ['index' if x.index.name is None else x.index.name]
        else:
            x = self.data[label].data
            default_title = 'Scatter Matrix - All Principal Components'
            frames = [x, y, self.x]
            hover_data = list(self.x.reset_index().columns)

        # the color series is optional, so it is left out of the plot data when absent
        df_plot: pd.DataFrame = pd.concat([frame for frame in frames if frame is not None],
                                          axis=1).reset_index()

        fig = px.scatter_matrix(data_frame=df_plot, dimensions=list(x.columns),
                                color=None if y is None else y.name,
                                hover_data=hover_data)
        fig.update_traces(diagonal_visible=False)
        fig.update_layout(title=default_title if title is None else title)
        return fig

    def plot_loading_vectors(self,
                             standardised: bool = False,
                             by_color: bool = False,
                             title: Optional[str] = None) -> go.Figure:
        """plot the loading vectors.

        Args:
            standardised: If True, plot the standardised PCA, where vectors are transformed to zero mean and
                unit variance.
            by_color: If True, plot the loading vectors by color group.  Group results only exist when the
                color series is non-numeric; when they are unavailable the ungrouped loading vectors are
                plotted and a warning is logged.
            title: Optional plot title

        Returns:
            a plotly GraphObjects.Figure
        """
        label: str = 'std' if standardised else 'raw'
        results: Dict = self.data

        groups: Optional[Dict] = results.get('group') if by_color else None
        if by_color and not groups:
            self._logger.warning("No group results are available (a non-numeric color series is required); "
                                 "plotting the ungrouped loading vectors.")

        if groups:
            loadings: pd.DataFrame = pd.concat(
                [grp_results[label].loadings.assign(group=grp) for grp, grp_results in groups.items()],
                axis='index')
            fig = px.scatter(loadings, x='PC1', y='PC2', color='group', hover_data=loadings.columns.tolist())
        else:
            loadings = results[label].loadings.iloc[:, 0:2]
            fig = px.scatter(loadings, x='PC1', y='PC2', hover_data=loadings.columns.tolist())

        # the markers only anchor the hover data; the vectors themselves are drawn as annotations
        fig.update_traces(marker=dict(size=1))

        if standardised:
            fig.add_shape(type="circle",
                          xref="x", yref="y",
                          x0=-1, y0=-1, x1=1, y1=1,
                          line_color="gray")
            title_main: str = "Correlation Circle"
        else:
            title_main = "Top 2 Principal Components"

        fig = self.add_loading_vectors(fig, loadings)

        # the reported percentages come from the same (raw or standardised) fit that is plotted
        variance: pd.Series = results[label].explained_variance
        if title is None:
            title = (f"{title_main}<br>Explained Variance = "
                     f"{round(variance.iloc[0:2].sum(), 1)}%")

        fig.update_layout(title=title,
                          xaxis_title=f"PC1 ({round(variance.iloc[0], 1)}%)",
                          yaxis_title=f"PC2 ({round(variance.iloc[1], 1)}%)")
        fig.update_yaxes(scaleanchor="x", scaleratio=1)
        fig.update_layout(scene=dict(aspectmode="data"))

        return fig

    def add_loading_vectors(self, fig, loadings) -> go.Figure:
        """Annotate a 2D figure with one labelled arrow per row of ``loadings``.

        Args:
            fig: the figure to annotate; it is modified in place.
            loadings: the loadings, indexed by feature.  The first two columns are used as the x and y
                coordinates of the arrow heads.  When a 'group' column is present the arrows and labels
                are colored by group.

        Returns:
            the annotated figure
        """
        grouped: bool = 'group' in loadings.columns
        grp_colors: Dict = {}
        if grouped:
            cm = px.colors.qualitative.Plotly
            # cycle through the palette so every group gets a color, even beyond the palette length
            grp_colors = {grp: cm[i % len(cm)] for i, grp in enumerate(loadings['group'].unique())}

        for i, feature in enumerate(loadings.index):
            x_head = loadings.iloc[i, 0]
            y_head = loadings.iloc[i, 1]
            if grouped:
                arrowcolor = grp_colors[loadings['group'].iloc[i]]
                font = dict(color=arrowcolor)
            else:
                arrowcolor = None
                font = None

            # the arrow, from the origin to the loading
            fig.add_annotation(
                ax=0, ay=0,
                axref="x", ayref="y",
                x=x_head,
                y=y_head,
                showarrow=True,
                arrowsize=2,
                arrowhead=2,
                xanchor="right",
                yanchor="top",
                arrowcolor=arrowcolor,
            )
            # the feature label at the arrow head
            fig.add_annotation(
                x=x_head,
                y=y_head,
                ax=0, ay=0,
                xanchor="center",
                yanchor="bottom",
                text=feature,
                yshift=5,
                font=font
            )
        return fig