Source code for survey_enhance.percentile_match

import pandas as pd
import numpy as np
from microdf import MicroDataFrame, MicroSeries
from typing import Union

DataFrame = Union[pd.DataFrame, MicroDataFrame]
Series = Union[pd.Series, MicroSeries]


def match_percentiles_df(
    target_df: DataFrame,
    source_df: DataFrame,
    percentile_threshold: float = 0.95,
    num_groups: int = 10,
) -> pd.DataFrame:
    """
    Match the upper percentiles of every column of target_df to source_df.

    Each column of target_df is passed, together with the column of the same
    name in source_df, to match_percentiles. The input target_df is not
    modified; the adjustment is applied to a copy.

    Args:
        target_df: The DataFrame whose values are adjusted.
        source_df: The DataFrame providing the reference distribution. It is
            indexed with every column name of target_df, so each of those
            columns is expected to exist in source_df.
        percentile_threshold: Don't adjust data for percentiles below this
            threshold.
        num_groups: The number of percentile groups to split the range
            between percentile_threshold and 1 into.

    Returns:
        A copy of target_df (same index and columns) with the adjusted
        values.
    """

    adjusted_df = target_df.copy()

    for column in adjusted_df.columns:
        # Keyword arguments keep the two tuning parameters from being
        # silently swapped if the helper's signature ever changes.
        adjusted_df[column] = match_percentiles(
            adjusted_df[column],
            source_df[column],
            percentile_threshold=percentile_threshold,
            num_groups=num_groups,
        )

    return adjusted_df


def match_percentiles(
    targets: Series,
    sources: Series,
    percentile_threshold: float = 0.95,
    num_groups: int = 10,
) -> pd.Series:
    """
    Match the upper percentiles of the target Series to the source Series.

    The percentile range from percentile_threshold to 1 is split into
    num_groups equally wide bands. For each band, the target values lying
    between the band's lower and upper quantiles (quantiles are taken over
    the strictly positive target values) are replaced with the mean of the
    source values lying between the same quantiles of the strictly positive
    source values.

    Args:
        targets: The Series to edit to match the source Series's percentiles.
            It is not modified; a copy is adjusted and returned.
        sources: The Series to match the percentiles to.
        percentile_threshold: Don't adjust data for percentiles below this
            threshold.
        num_groups: The number of percentile groups to split the data into.

    Returns:
        A Series with the same index as targets, but with the adjusted
        values.
    """
    if not isinstance(targets, MicroSeries):
        targets = MicroSeries(targets)
    if not isinstance(sources, MicroSeries):
        sources = MicroSeries(sources)

    # All quantiles and range masks are derived from the unmodified targets;
    # the replacements are written to a separate copy. Computing them from
    # the Series being overwritten would make every band depend on the
    # values written by the previous bands.
    adjusted = targets.copy()

    # Loop-invariant: only strictly positive values define the quantiles.
    positive_targets = targets[targets > 0]
    positive_sources = sources[sources > 0]

    percentile_boundaries = np.linspace(
        percentile_threshold, 1, num_groups + 1
    )

    for lower, upper in zip(
        percentile_boundaries[:-1], percentile_boundaries[1:]
    ):
        lower_target = positive_targets.quantile(lower)
        upper_target = positive_targets.quantile(upper)
        lower_source = positive_sources.quantile(lower)
        upper_source = positive_sources.quantile(upper)

        # Both ends of each range are inclusive, so a value sitting exactly
        # on a shared boundary belongs to two bands; the later (higher) band
        # is applied last and therefore wins.
        target_in_range = (targets >= lower_target) & (
            targets <= upper_target
        )
        source_in_range = (sources >= lower_source) & (
            sources <= upper_source
        )

        # Replace all target values in the current percentile range with the
        # mean of the source values in the same percentile range.
        band_mean = sources[source_in_range].mean()
        if pd.isna(band_mean):
            # No usable source values in this band: leave the targets as
            # they are instead of overwriting them with NaN.
            continue

        adjusted[target_in_range] = band_mean

    return adjusted