# Source code for elphick.mass_composition.utils.pd_utils

"""
Pandas utils
"""
import inspect
import logging
from typing import List, Dict, Optional

import pandas as pd
from pandas import DataFrame
from pandas.core.dtypes.common import is_float_dtype

from elphick.mass_composition.utils import solve_mass_moisture
from elphick.mass_composition.utils.size import mean_size


def column_prefixes(columns: List[str]) -> Dict[str, List[str]]:
    """Group column names by the text preceding their first underscore.

    Only names that contain at least one underscore introduce a prefix.  Once a
    prefix exists, every column whose first underscore-delimited token equals
    it is listed under it - including a column that is exactly the prefix.

    Args:
        columns: The column names to inspect.

    Returns:
        A dict mapping each prefix (in order of first appearance) to the
        matching column names (in their original order).
    """
    grouped: Dict[str, List[str]] = {}

    # First pass: register the prefixes, preserving first-seen order.
    for name in columns:
        tokens = name.split('_')
        if len(tokens) > 1:
            grouped.setdefault(tokens[0], [])

    # Second pass: attach every column whose leading token is a known prefix.
    for name in columns:
        leading = name.split('_')[0]
        if leading in grouped:
            grouped[leading].append(name)

    return grouped


def column_prefix_counts(columns: List[str]) -> Dict[str, int]:
    """Count the columns that fall under each prefix.

    Args:
        columns: The column names to inspect.

    Returns:
        A dict mapping each prefix found by column_prefixes to the number of
        columns grouped under it.
    """
    counts: Dict[str, int] = {}
    for prefix, members in column_prefixes(columns).items():
        counts[prefix] = len(members)
    return counts


def mass_to_composition(df: pd.DataFrame,
                        mass_wet: str = 'mass_wet',
                        mass_dry: str = 'mass_dry') -> pd.DataFrame:
    """Convert component masses to composition expressed as a percentage of dry mass

    Args:
        df: The pd.DataFrame of masses.  Columns named h2o or moisture (any case) and non-float columns are
         not treated as components.  Every remaining column other than the two mass columns is divided by
         the dry mass and multiplied by 100, i.e. it is assumed to be `additive` and reported in %w/w.
        mass_wet: The name of the wet mass column, which must be present in df.
        mass_dry: The name of the dry mass column, which must be present in df.

    Returns:
        A pd.DataFrame made of the wet and dry mass columns, the moisture returned by solve_mass_moisture,
        and the composition columns, concatenated in that order.
    """
    # Call the helper directly from this frame: it names its logger after its caller.
    ignored = _detect_non_float_columns(df)
    excluded = {mass_wet.lower(), mass_dry.lower(), 'h2o', 'moisture'}
    excluded.update(col.lower() for col in ignored)

    mass: pd.DataFrame = df[[mass_wet, mass_dry]]
    component_cols: List[str] = []
    for col in df.columns:
        if col.lower() not in excluded:
            component_cols.append(col)

    # Component mass / dry mass, scaled to percent.
    fractions: pd.DataFrame = df[component_cols].div(mass[mass_dry], axis=0)
    composition: pd.DataFrame = fractions * 100.0
    moisture: pd.Series = solve_mass_moisture(mass_wet=mass[mass_wet], mass_dry=mass[mass_dry])

    parts = [mass, moisture, composition]
    return pd.concat(parts, axis='columns')


def composition_to_mass(df: pd.DataFrame,
                        mass_wet: str = 'mass_wet',
                        mass_dry: str = 'mass_dry') -> pd.DataFrame:
    """Convert composition (percentage of dry mass) to component masses

    Args:
        df: The pd.DataFrame containing mass_wet, mass_dry and composition columns.  Columns named h2o or
          moisture (any case) and non-float columns are dropped.  Every remaining column other than the two
          mass columns is multiplied by the dry mass and divided by 100, i.e. it is assumed to be `additive`
          and supplied in %w/w.
        mass_wet: The name of the wet mass column, which must be present in df.
        mass_dry: The name of the dry mass column, which must be present in df.

    Returns:
        A pd.DataFrame made of the wet and dry mass columns, an `H2O` column holding wet mass minus dry mass,
        and the component mass columns, concatenated in that order.
    """
    # Call the helper directly from this frame: it names its logger after its caller.
    ignored = _detect_non_float_columns(df)
    excluded = {mass_wet.lower(), mass_dry.lower(), 'h2o', 'moisture'}
    excluded.update(col.lower() for col in ignored)

    mass: pd.DataFrame = df[[mass_wet, mass_dry]]
    component_cols: List[str] = []
    for col in df.columns:
        if col.lower() not in excluded:
            component_cols.append(col)

    # Percent of dry mass -> absolute mass.
    scaled: pd.DataFrame = df[component_cols].mul(mass[mass_dry], axis=0)
    component_mass: pd.DataFrame = scaled / 100.0

    # Water mass is simply the wet/dry difference.
    water = mass[mass_wet] - mass[mass_dry]
    moisture_mass: pd.Series = pd.Series(water, name='H2O', index=mass.index)

    parts = [mass, moisture_mass, component_mass]
    return pd.concat(parts, axis='columns')


def weight_average(df: pd.DataFrame,
                   mass_wet: str = 'mass_wet',
                   mass_dry: str = 'mass_dry') -> DataFrame:
    """Weight Average a DataFrame containing mass-composition

    The composition is converted to component mass, summed over all rows, and converted back to a
    percentage of the total dry mass.

    Args:
        df: The pd.DataFrame containing mass-composition.  Columns named h2o or moisture (any case) and
         non-float columns are ignored.  All columns other than the mass_wet and mass_dry are assumed to be
         `additive`, that is, dry mass weighting is valid.  Assumes composition is in %w/w units.
        mass_wet: The wet mass column, not optional.  Consider solve_mass_moisture prior to this call if needed.
        mass_dry: The dry mass column, not optional.  Consider solve_mass_moisture prior to this call if needed.

    Returns:
        A single-row pd.DataFrame containing the total wet and dry mass, the moisture returned by
        solve_mass_moisture for those totals, and the weight averaged composition.
    """
    non_float_cols = _detect_non_float_columns(df)

    # Compare in lower case on both sides, exactly as composition_to_mass does.  Otherwise a mixed-case
    # mass column would be treated as a component, and a mixed-case non-float column (which
    # composition_to_mass drops) would be looked up in mass_sum and raise a KeyError.
    excluded = {mass_wet.lower(), mass_dry.lower(), 'h2o', 'moisture'}
    excluded.update(col.lower() for col in non_float_cols)
    component_cols = [col for col in df.columns if col.lower() not in excluded]

    # Total mass of every component, kept as a one-row frame so column selection works below.
    mass_sum: pd.DataFrame = df.pipe(composition_to_mass, mass_wet=mass_wet, mass_dry=mass_dry).sum(
        axis="index").to_frame().T
    moisture: pd.Series = solve_mass_moisture(mass_wet=mass_sum[mass_wet],
                                              mass_dry=mass_sum[mass_dry])
    weighted_composition: pd.DataFrame = mass_sum[component_cols].div(mass_sum[mass_dry], axis=0) * 100

    return pd.concat([mass_sum[[mass_wet, mass_dry]], moisture, weighted_composition], axis=1)


def calculate_recovery(df: pd.DataFrame,
                       df_ref: pd.DataFrame,
                       mass_wet: str = 'mass_wet',
                       mass_dry: str = 'mass_dry') -> pd.DataFrame:
    """Calculate recovery of mass-composition for two DataFrames

    Both frames are converted to component mass with composition_to_mass and the result for df is divided
    by the result for df_ref.

    Args:
        df: The pd.DataFrame containing the mass-composition of interest (the numerator).  Columns are
         handled as described for composition_to_mass.
        df_ref: The pd.DataFrame containing the reference mass-composition (the denominator).
        mass_wet: The wet mass column, not optional.  Consider solve_mass_moisture prior to this call if needed.
        mass_dry: The dry mass column, not optional.  Consider solve_mass_moisture prior to this call if needed.

    Returns:
        A pd.DataFrame containing the ratio of the masses in df to the masses in df_ref.
    """
    numerator: pd.DataFrame = composition_to_mass(df, mass_wet=mass_wet, mass_dry=mass_dry)
    denominator: pd.DataFrame = composition_to_mass(df_ref, mass_wet=mass_wet, mass_dry=mass_dry)
    return numerator / denominator


def calculate_partition(df_feed: pd.DataFrame,
                        df_ref: pd.DataFrame,
                        col_mass_dry: str = 'mass_dry') -> pd.DataFrame:
    """Calculate the partition curve from two streams

    Applicable to the one dimensional case only.  The partition number (PN) is the dry mass of the
    reference stream divided by the dry mass of the feed, per fraction.  No clipping is applied, so PN
    lies in [0, 1] only when the reference mass does not exceed the feed mass.
    The interval mean is delegated to mean_size when the index of df_ref is named `size` (any case),
    otherwise the arithmetic mid-point of each interval is used.  An unnamed index is treated as not size.
    The interval mean is named `da`, which can be interpreted as `diameter-average` or `density-average`.
    TODO: consider a generalised name, fraction-average -> fa?

    Args:
        df_feed: The pd.DataFrame containing mass-composition representing the fractionated feed.
        df_ref: The pd.DataFrame containing mass-composition representing the fractionated reference stream.
        col_mass_dry: The dry mass column, not optional.

    Returns:
        A pd.DataFrame containing the partition data: column `da` followed by column `PN`.
    """

    res: pd.DataFrame = df_ref[[col_mass_dry]].div(df_feed[[col_mass_dry]]).rename(columns={col_mass_dry: 'PN'})

    # Index.name is None for an unnamed index; guard so .lower() cannot raise AttributeError.
    index_name = df_ref.index.name
    if isinstance(index_name, str) and index_name.lower() == 'size':
        res.insert(loc=0, column='da', value=mean_size(res.index))
    else:
        # .mid is only available on an interval index.
        res.insert(loc=0, column='da', value=res.index.mid)
    return res


def _detect_non_float_columns(df):
    """List the columns of df that are not selected as float, logging them if any exist

    The message is logged at INFO level on a logger named after the calling function.

    Args:
        df: The pd.DataFrame to inspect.

    Returns:
        The names of the columns not returned by df.select_dtypes(include=[float]), in column order.
    """
    # Read the caller's name straight from its frame; inspect.stack() would build a record (with source
    # context) for every frame on the stack just to obtain this one string.
    frame = inspect.currentframe()
    caller = frame.f_back if frame is not None else None
    if caller is not None:
        caller_name: str = caller.f_code.co_name
    else:
        # Fallback for interpreters without frame support, or when there is no calling frame.
        caller_name = inspect.stack()[1].function
    _logger: logging.Logger = logging.getLogger(caller_name)

    # Evaluate the dtype selection once rather than once per column.
    float_cols = df.select_dtypes(include=[float]).columns
    non_float_cols: List = [col for col in df.columns if col not in float_cols]
    if non_float_cols:
        _logger.info(f"The following columns are not float columns and will be ignored: {non_float_cols}")
    return non_float_cols