This document provides a comprehensive guide to benchmarking different imputation methods using Microimpute. The examples below illustrate the workflow for comparing various imputation approaches and evaluating their performance.
# On the Diabetes Dataset (Numerical Variables)
from typing import List, Type
import pandas as pd
from microimpute.comparisons import *
from microimpute.config import RANDOM_STATE
from microimpute.models import *
from microimpute.visualizations import method_comparison_results
from microimpute.evaluations import *
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
# 1. Prepare data
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
X_train, X_test = train_test_split(
    df, test_size=0.2, random_state=RANDOM_STATE
)
predictors = ["age", "sex", "bmi", "bp"]
imputed_variables = ["s1", "s4"] # Numerical variables
Y_test: pd.DataFrame = X_test[imputed_variables]
# 2. Run imputation methods
model_classes: List[Type[Imputer]] = [QRF, OLS, QuantReg, Matching]
method_imputations = get_imputations(
    model_classes, X_train, X_test, predictors, imputed_variables
)
# 3. Compare imputation methods using unified metrics function
# The function automatically detects that these are numerical variables and uses quantile loss
loss_comparison_df = compare_metrics(
    Y_test, method_imputations, imputed_variables
)
# 4. Plot results - filter for quantile loss metrics only
quantile_loss_df = loss_comparison_df[loss_comparison_df['Metric'] == 'quantile_loss']
comparison_viz = method_comparison_results(
    data=quantile_loss_df,
    metric="quantile_loss",
    data_format="long",
)
fig = comparison_viz.plot(
    title="Method Comparison on Diabetes Dataset (Numerical Variables)",
)
fig.show()
# Display summary statistics
print("Summary of quantile loss by method:")
summary = quantile_loss_df[quantile_loss_df['Imputed Variable'] == 'mean_quantile_loss'].groupby('Method')['Loss'].mean()
print(summary.sort_values())
# Evaluate sensitivity to predictors
leave_one_out_results = leave_one_out_analysis(
    df,
    predictors,
    imputed_variables,
    QRF
)
predictor_inclusion_results = progressive_predictor_inclusion(
    df,
    predictors,
    imputed_variables,
    QRF
)
print(f"Inclusion order: {predictor_inclusion_results['inclusion_order']}")
for element in predictor_inclusion_results["predictor_impacts"]:
    print(f"Predictor: {element['predictor']} \nLoss reduction: {element['loss_reduction']}")

# On the SCF Dataset
from typing import List, Type, Optional, Union
import io
import logging
import pandas as pd
from pydantic import validate_call
import requests
import zipfile
from microimpute.comparisons import *
from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG, VALID_YEARS
from microimpute.models import *
from microimpute.utils.data import preprocess_data
from microimpute.visualizations import method_comparison_results
from microimpute.evaluations import *
import warnings
warnings.filterwarnings("ignore")
logger = logging.getLogger(__name__)
# 1. Prepare data
@validate_call(config=VALIDATE_CONFIG)
def load_scf(
    years: Optional[Union[int, List[int]]] = None,
    columns: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Load Survey of Consumer Finances data for specified years and columns.

    Args:
        years: Year or list of years to load data for.
        columns: List of column names to load.

    Returns:
        DataFrame containing the requested data.

    Raises:
        ValueError: If no Stata files are found in the downloaded zip
            or invalid parameters are provided.
        RuntimeError: If there's a network error or a problem processing
            the downloaded data.
    """

    def scf_url(year: int) -> str:
        """Return the URL of the SCF summary microdata zip file for a year."""
        if year not in VALID_YEARS:
            logger.error(
                f"Invalid SCF year: {year}. Valid years are {VALID_YEARS}"
            )
            raise ValueError(
                f"Invalid SCF year: {year}. Valid years are {VALID_YEARS}"
            )
        url = f"https://www.federalreserve.gov/econres/files/scfp{year}s.zip"
        return url

    logger.info(f"Loading SCF data with years={years}")
    try:
        # Identify years for download
        if years is None:
            years = VALID_YEARS
            logger.warning(f"Using default years: {years}")
        if isinstance(years, int):
            years = [years]

        all_data: List[pd.DataFrame] = []
        for year in years:
            logger.info(f"Processing data for year {year}")
            try:
                # Download zip file
                logger.debug(f"Downloading SCF data for year {year}")
                url = scf_url(year)
                try:
                    response = requests.get(url, timeout=60)
                    response.raise_for_status()  # Raise an error for bad responses
                except requests.exceptions.RequestException as e:
                    logger.error(
                        f"Network error downloading SCF data for year {year}: {str(e)}"
                    )
                    raise
                # Process zip file
                z = zipfile.ZipFile(io.BytesIO(response.content))
                # Find the .dta file in the zip
                dta_files: List[str] = [
                    f for f in z.namelist() if f.endswith(".dta")
                ]
                if not dta_files:
                    logger.error(
                        f"No Stata files found in zip for year {year}"
                    )
                    raise ValueError(
                        f"No Stata files found in zip for year {year}"
                    )
                # Read the Stata file
                try:
                    logger.debug(f"Reading Stata file: {dta_files[0]}")
                    with z.open(dta_files[0]) as f:
                        df = pd.read_stata(
                            io.BytesIO(f.read()), columns=columns
                        )
                    logger.debug(
                        f"Read DataFrame with shape {df.shape}"
                    )
                except Exception as e:
                    logger.error(
                        f"Error reading Stata file for year {year}: {str(e)}"
                    )
                    raise
                # Add year column
                df["year"] = year
                logger.info(
                    f"Successfully processed data for year {year}, shape: {df.shape}"
                )
                all_data.append(df)
            except Exception as e:
                logger.error(f"Error processing year {year}: {str(e)}")
                raise
        # Combine all years
        logger.debug(f"Combining data from {len(all_data)} years")
        if len(all_data) > 1:
            result = pd.concat(all_data)
            logger.info(
                f"Combined data from {len(years)} years, final shape: {result.shape}"
            )
            return result
        else:
            logger.info(
                f"Returning data for single year, shape: {all_data[0].shape}"
            )
            return all_data[0]
    except Exception as e:
        logger.error(f"Error in load_scf: {str(e)}")
        raise
scf_data = load_scf(2022)
PREDICTORS: List[str] = [
    "hhsex",  # sex of head of household
    "age",  # age of respondent
    "married",  # marital status of respondent
    # "kids",  # number of children in household
    "race",  # race of respondent
    "income",  # total annual income of household
    "wageinc",  # income from wages and salaries
    "bussefarminc",  # income from business, self-employment or farm
    "intdivinc",  # income from interest and dividends
    "ssretinc",  # income from social security and retirement accounts
    "lf",  # labor force status
]
IMPUTED_VARIABLES: List[str] = ["networth"]
# Evaluate predictors
predictors_evaluation = compute_predictor_correlations(scf_data, PREDICTORS, IMPUTED_VARIABLES)
print("\nMutual information with networth:")
print(predictors_evaluation["predictor_target_mi"])
X_train, X_test = preprocess_data(
    data=scf_data, full_data=False, normalize=False,
)
# Shrink down the data by sampling
X_train = X_train.sample(frac=0.01, random_state=RANDOM_STATE)
X_test = X_test.sample(frac=0.01, random_state=RANDOM_STATE)
Y_test: pd.DataFrame = X_test[IMPUTED_VARIABLES]
# 2. Run imputation methods
model_classes: List[Type[Imputer]] = [QRF, OLS, QuantReg, Matching]
method_imputations = get_imputations(
    model_classes, X_train, X_test, PREDICTORS, IMPUTED_VARIABLES
)
# 3. Compare imputation methods using unified metrics function
loss_comparison_df = compare_metrics(
    Y_test, method_imputations, IMPUTED_VARIABLES
)
# 4. Plot results - filter for quantile loss metrics only
quantile_loss_df = loss_comparison_df[loss_comparison_df['Metric'] == 'quantile_loss']
comparison_viz = method_comparison_results(
    data=quantile_loss_df,
    metric="quantile_loss",
    data_format="long",
)
fig = comparison_viz.plot(
    title="Method Comparison on SCF Dataset",
    show_mean=True,
)
fig.show()
# Compare distribution similarity
for model, imputations in method_imputations.items():
    similarity = compare_distributions(
        donor_data=pd.DataFrame(scf_data["networth"]),
        receiver_data=pd.DataFrame(imputations[0.5]["networth"]),
        imputed_variables=IMPUTED_VARIABLES,
    )
    print(f"Model: {model}, distribution similarity: \n{similarity}")

Data preparation¶
The data preparation phase establishes the foundation for meaningful benchmarking comparisons. The load_scf() function downloads data from user-specified survey years, carefully selecting relevant predictor and target variables that capture the essential relationships for imputation.
The preprocess_data() function applies normalization to the features when normalize=True, ensuring that variables with different scales do not unduly influence the imputation models. This preprocessing step is crucial, particularly for methods like nearest neighbor matching that rely on distance calculations. Finally, the function splits the data into training and testing sets, maintaining the statistical properties of both sets while creating an appropriate evaluation framework. If you would like to normalize the dataset without splitting it (for example, when performing cross-validation), set the full_data parameter to True.
# Normalizing
processed_data = preprocess_data(dataset, full_data=True)
# Normalizing and splitting
X_train, X_test = preprocess_data(dataset)

Imputation generation¶
The imputation generation process serves as the core operational phase of the benchmarking framework. The get_imputations() function orchestrates this process with remarkable efficiency, handling all aspects of model training and prediction generation. It systematically trains each specified model on identical training data, ensuring a fair comparison across different imputation approaches.
After training, the function generates predictions at user-specified quantiles, allowing for evaluation across different parts of the conditional distribution. The quantile-based approach provides insights not just into central tendency (as with mean-based methods) but into the entire shape of the imputed distributions. This comprehensive prediction generation creates a rich dataset for subsequent evaluation.
The function organizes all results into a consistent, structured format designed for straightforward comparison. The returned nested dictionary architecture provides intuitive access to predictions from different models at different quantiles:
{
    "ModelName1": {
        0.1: DataFrame of predictions at 10th percentile,
        0.5: DataFrame of predictions at 50th percentile,
        0.9: DataFrame of predictions at 90th percentile,
    },
    "ModelName2": {
        0.1: DataFrame of predictions at 10th percentile,
        ...
    },
    ...
}

This well-designed data structure simplifies downstream analysis and visualization, allowing researchers to focus on interpreting results rather than managing data formats.
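For example, the median (0.5 quantile) imputations of a single method can be retrieved directly from this structure. The snippet below assumes the string keys shown above and the QRF model from the earlier examples:

# Median (50th percentile) imputations produced by one method
qrf_median = method_imputations["QRF"][0.5]

# Or compare median predictions across every benchmarked method
for model_name, quantile_preds in method_imputations.items():
    print(model_name, quantile_preds[0.5].describe())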
At this stage, a model object can only handle the imputation of one variable at a time, meaning that to impute multiple variables from a dataset, a new model object must be created for each of them.
Evaluation metrics for different variable types¶
Microimpute employs evaluation metrics tailored to the type of variable being imputed. The framework automatically selects the appropriate metric based on whether the imputed variable is numerical or categorical, ensuring meaningful performance assessment across different data types.
Quantile loss for numerical imputation¶
The evaluation of numerical imputation employs quantile loss to assess imputation quality. This approach provides a more nuanced evaluation than traditional metrics like mean squared error, particularly for capturing performance across different parts of the distribution.
At the foundation of this evaluation lies the quantile_loss() function, which implements the standard quantile (pinball) loss formulation:

$$L_q(y, \hat{y}) = \max\big(q\,(y - \hat{y}),\; (q - 1)(y - \hat{y})\big)$$

where $q$ is the quantile to be evaluated, $y$ represents the true value and $\hat{y}$ is the imputed value.
This mathematical formulation creates an asymmetric loss function that penalizes under-prediction more heavily for higher quantiles and over-prediction more heavily for lower quantiles. This asymmetry aligns perfectly with the interpretation of quantiles—a 90th percentile prediction should rarely be below the true value, while a 10th percentile prediction should rarely exceed it.
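As a minimal illustration of this asymmetry (a NumPy sketch of the formula above, not the package's internal routine):

import numpy as np

def pinball_loss(y_true: np.ndarray, y_pred: np.ndarray, q: float) -> float:
    """Standard quantile (pinball) loss, averaged over observations."""
    diff = y_true - y_pred
    return float(np.mean(np.maximum(q * diff, (q - 1) * diff)))

y_true, y_pred = np.array([10.0]), np.array([8.0])  # under-prediction by 2
print(pinball_loss(y_true, y_pred, q=0.9))  # 1.8: costly at the 90th percentile
print(pinball_loss(y_true, y_pred, q=0.1))  # 0.2: mild at the 10th percentile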
Log loss for categorical imputation¶
When imputing categorical variables, the framework switches to log loss (also known as cross-entropy loss), which is specifically designed for evaluating probabilistic predictions of categorical outcomes. Log loss measures the performance of a classification model where the prediction output is a probability value between 0 and 1.
The log loss metric is calculated using the formula:

$$\text{LogLoss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{c=1}^{C} y_{ic} \log(p_{ic})$$

where:

$N$ is the number of samples
$C$ is the number of classes
$y_{ic}$ is 1 if sample $i$ belongs to class $c$, and 0 otherwise
$p_{ic}$ is the predicted probability of sample $i$ belonging to class $c$
Unlike quantile loss, which evaluates numerical predictions at different percentiles, log loss evaluates the quality of probability estimates for categorical predictions. A perfect classifier would have a log loss of 0, while worse predictions yield increasingly higher values. The metric heavily penalizes confident misclassifications: predicting a class with high probability when it is incorrect results in a large loss value.
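A small self-contained sketch of the formula (illustrative only, not the package's internal routine) makes the penalty for confident misclassification concrete:

import numpy as np

def categorical_log_loss(y_onehot: np.ndarray, probs: np.ndarray, eps: float = 1e-15) -> float:
    """Cross-entropy averaged over N samples and C classes."""
    probs = np.clip(probs, eps, 1 - eps)
    return float(-np.mean(np.sum(y_onehot * np.log(probs), axis=1)))

y = np.array([[1, 0], [0, 1]])                          # true classes for two samples
confident_wrong = np.array([[0.05, 0.95], [0.9, 0.1]])
well_calibrated = np.array([[0.8, 0.2], [0.3, 0.7]])
print(categorical_log_loss(y, confident_wrong))          # large loss (~2.65)
print(categorical_log_loss(y, well_calibrated))          # much smaller loss (~0.29)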
This distinction is crucial for proper model evaluation:
Quantile loss is used for continuous numerical variables where we care about the distribution of predicted values
Log loss is used for categorical variables where we care about the accuracy of class probability predictions
The framework automatically detects the variable type and applies the appropriate metric. For models that handle both types of variables (like OLS and QRF), the evaluation will produce separate quantile loss results for numerical variables and log loss results for categorical variables.
Unified evaluation framework¶
The integration of these complementary metrics culminates in the compute_loss() and compare_loss() functions, which systematically evaluate multiple methods using the appropriate metric for each variable type. When dealing with mixed datasets containing both numerical and categorical variables, the framework produces separate evaluation results for each metric type, allowing researchers to assess model performance comprehensively across all variable types.
Distribution similarity metrics¶
The compare_distributions() function evaluates how well the imputed values preserve the distributional characteristics of the original data. It automatically selects the appropriate metric based on the variable type: Wasserstein distance for continuous numerical variables and Kullback-Leibler (KL) divergence for discrete categorical and boolean variables.
Wasserstein distance for numerical variables¶
For continuous numerical variables, the framework uses the Wasserstein distance (also known as Earth Mover’s Distance) to quantify the difference between distributions. The Wasserstein distance between two probability distributions $P$ and $Q$ is defined as:

$$W(P, Q) = \inf_{\gamma \in \Gamma(P, Q)} \mathbb{E}_{(x, y) \sim \gamma}\left[\lVert x - y \rVert\right]$$

where $\Gamma(P, Q)$ denotes the set of all joint distributions whose marginals are $P$ and $Q$ respectively.
The Wasserstein distance measures the minimum “work” required to transform one distribution into another, where work is defined as the amount of distribution mass moved times the distance it’s moved. Lower values indicate better preservation of the original distribution’s shape. In the SCF example, QRF shows the lowest Wasserstein distance (1.2e7), indicating it best preserves the distribution of net worth values, while QuantReg shows the highest distance (2.8e7), suggesting greater distributional distortion.
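For intuition, the same distance can be computed directly on raw samples with SciPy; the synthetic, wealth-like data below is purely illustrative and is not how compare_distributions() is invoked internally:

import numpy as np
from scipy.stats import wasserstein_distance

rng = np.random.default_rng(0)
original = rng.lognormal(mean=11.0, sigma=1.5, size=5_000)         # skewed, wealth-like
imputed_similar = rng.lognormal(mean=11.0, sigma=1.5, size=5_000)
imputed_shifted = rng.lognormal(mean=11.5, sigma=1.5, size=5_000)  # systematically larger

print(wasserstein_distance(original, imputed_similar))   # small: shape preserved
print(wasserstein_distance(original, imputed_shifted))   # larger: distribution distorted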
Kullback-Leibler divergence for categorical and boolean variables¶
For discrete distributions (categorical and boolean variables), the framework employs KL divergence, an information-theoretic measure that quantifies how one probability distribution diverges from a reference distribution. The KL divergence from distribution $Q$ to distribution $P$ is defined as:

$$D_{\mathrm{KL}}(P \parallel Q) = \sum_{x \in \mathcal{X}} P(x) \log \frac{P(x)}{Q(x)}$$

where:

$P$ is the reference distribution (original data)
$Q$ is the approximation (imputed data)
$\mathcal{X}$ is the set of all possible categorical values
In the context of imputation evaluation, KL divergence measures how much information is lost when using the imputed distribution $Q$ to approximate the true distribution $P$. Lower KL divergence values indicate better preservation of the original categorical distribution.
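For a quick illustration, scipy.stats.entropy returns the KL divergence when given two distributions; the category proportions below are made up for the example:

import numpy as np
from scipy.stats import entropy

p_original = np.array([0.50, 0.30, 0.20])   # reference: category shares in the original data
q_imputed = np.array([0.45, 0.35, 0.20])    # imputed shares close to the original
q_distorted = np.array([0.20, 0.20, 0.60])  # heavily distorted shares

print(entropy(p_original, q_imputed))    # near 0: little information lost
print(entropy(p_original, q_distorted))  # larger: categorical distribution distorted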
Predictor analysis and sensitivity evaluation¶
Beyond comparing imputation methods, understanding the relationship between predictors and target variables, as well as the sensitivity of imputation quality to predictor selection, provides crucial insights for model optimization and feature engineering.
Predictor-target mutual information¶
The compute_predictor_correlations() function with the imputed_variables parameter computes normalized mutual information between each predictor and the target variables. Mutual information measures the reduction in uncertainty about one variable given knowledge of another, making it particularly valuable for mixed data types (numeric and categorical). Unlike correlation coefficients that capture only linear relationships, mutual information detects any statistical dependency.
For discrete random variables $X$ and $Y$, the mutual information is defined as:

$$I(X; Y) = \sum_{x \in X} \sum_{y \in Y} p(x, y) \log \frac{p(x, y)}{p(x)\,p(y)}$$

where:

$p(x, y)$ is the joint probability distribution of $X$ and $Y$
$p(x)$ and $p(y)$ are the marginal probability distributions

For continuous variables, the summations are replaced by integrals:

$$I(X; Y) = \int_X \int_Y p(x, y) \log \frac{p(x, y)}{p(x)\,p(y)} \, dy \, dx$$

The normalized mutual information (NMI) used in the implementation is:

$$\mathrm{NMI}(X; Y) = \frac{I(X; Y)}{\sqrt{H(X)\,H(Y)}}$$

where $H(X)$ and $H(Y)$ are the entropies of $X$ and $Y$ respectively.
The normalized values range from 0 (no relationship) to 1 (perfect dependency), allowing direct comparison of predictor importance across different variable types. In the SCF example, we see that income (MI=0.156) and wageinc (MI=0.122) have the strongest relationships with networth, while lf (labor force status, MI=0.011) has minimal information content.
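For intuition, an unnormalized estimate of the same quantity can be obtained with scikit-learn on synthetic data; note that mutual_info_regression reports raw mutual information (in nats), unlike the normalized values returned by compute_predictor_correlations():

import numpy as np
from sklearn.feature_selection import mutual_info_regression

rng = np.random.default_rng(0)
income = rng.lognormal(mean=10.0, sigma=1.0, size=2_000)
noise = rng.normal(size=2_000)                                   # unrelated predictor
networth = 5 * income + rng.normal(scale=income.std(), size=2_000)

X = np.column_stack([income, noise])
mi = mutual_info_regression(X, networth, random_state=0)
print(dict(zip(["income", "noise"], mi.round(3))))               # income carries far more information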
Leave-one-out predictor analysis¶
The leave_one_out_analysis() function systematically evaluates the importance of each predictor by removing it from the model and measuring the resulting performance degradation. This approach provides a direct measure of each predictor’s contribution to imputation quality.
The analysis returns several key metrics:
Loss increase: The absolute increase in loss when the predictor is removed
Relative impact: The percentage increase in loss, indicating the predictor’s relative importance
Predictors with high relative impact are essential for accurate imputation and should not be removed from the model. Conversely, predictors with minimal impact might be candidates for removal to simplify the model and reduce computational costs.
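The general idea can be sketched outside the package with any regressor and a generic cross-validated error; the helper below is a hypothetical stand-in, not the microimpute implementation:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

def leave_one_out_importance(df: pd.DataFrame, predictors: list, target: str) -> dict:
    """Drop each predictor in turn and record how much the CV error worsens."""
    def cv_error(cols):
        model = RandomForestRegressor(random_state=0)
        scores = cross_val_score(
            model, df[cols], df[target], scoring="neg_mean_absolute_error", cv=3
        )
        return -scores.mean()

    baseline = cv_error(predictors)
    return {
        p: cv_error([c for c in predictors if c != p]) - baseline
        for p in predictors
    }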
Progressive predictor inclusion¶
The progressive_predictor_inclusion() function takes a complementary approach by starting with no predictors and iteratively adding the one that provides the greatest performance improvement. This greedy forward selection reveals:
Optimal inclusion order: The sequence in which predictors should be added for maximum benefit
Marginal contribution: The performance improvement from adding each predictor
Optimal subset: The minimal set of predictors achieving near-optimal performance
In the diabetes dataset example, the inclusion order [bp, age, bmi, sex] shows that blood pressure provides the most information, followed by age and BMI, with sex providing minimal additional benefit. The diminishing returns in loss reduction (from 0.0006 to 0.0003) illustrate when additional predictors provide negligible improvement.
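The same greedy logic can be sketched generically (again a hypothetical illustration, reusing the cross-validated error idea from the previous sketch rather than the package's internals):

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

def greedy_forward_selection(df: pd.DataFrame, predictors: list, target: str) -> list:
    """At each step, add the predictor that reduces CV error the most."""
    def cv_error(cols):
        model = RandomForestRegressor(random_state=0)
        scores = cross_val_score(
            model, df[cols], df[target], scoring="neg_mean_absolute_error", cv=3
        )
        return -scores.mean()

    selected, remaining, history = [], list(predictors), []
    while remaining:
        errors = {p: cv_error(selected + [p]) for p in remaining}
        best = min(errors, key=errors.get)
        selected.append(best)
        remaining.remove(best)
        history.append((best, errors[best]))  # inclusion order with the error achieved
    return history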
Visualization¶
The method_comparison_results.plot() function generates bar charts that group benchmarking results by model, making it easy to identify patterns and trends in performance across different methods and different parts of the distribution. The metric parameter accepts “quantile_loss”, “log_loss” or “combined” as the metric of choice for the visualization.
The function employs color coding to visually distinguish between different imputation models, making it easy to track the performance of a single method. Along the horizontal axis, when visualizing quantile loss the chart displays different quantiles (such as the 10th, 25th, 50th percentiles), allowing assessment across the entire distribution of interest. The vertical axis represents average loss, with lower values indicating better performance (both for quantile loss and log loss), giving an immediate visual indication of which models are performing well.
Extending the benchmarking framework¶
The Microimpute benchmarking framework was designed with extensibility as a core principle, allowing researchers to easily integrate and evaluate new imputation approaches. To incorporate your own custom imputation model into this evaluation framework, you can follow a straightforward process.
First, implement your custom model by extending the Imputer abstract base class, following the design patterns and interface requirements of the Imputer interface. Then, add your custom model class to the model_classes list alongside the built-in models you wish to compare against. Finally, execute the benchmarking process as described previously, and your custom model will be evaluated using the same rigorous methodology applied to the built-in models.
This seamless integration is possible because all models that implement the Imputer interface share a common API, allowing the benchmarking framework to interact with them in a consistent manner regardless of their internal implementation details. This architectural decision makes the framework inherently extensible while maintaining a clean separation between the benchmarking logic and the specific imputation methods being evaluated.
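A minimal sketch of this integration step is shown below; the import paths follow the examples above, while the class body and its required methods are placeholders to be filled in per the Imputer interface documentation:

from typing import List, Type
from microimpute.comparisons import get_imputations  # assumed location, per the wildcard imports above
from microimpute.models import Imputer, QRF, OLS

class MyCustomImputer(Imputer):
    """Skeleton only: implement the abstract methods required by the Imputer interface."""
    ...  # fitting and quantile prediction logic goes here

# The custom class is benchmarked exactly like the built-in models
model_classes: List[Type[Imputer]] = [QRF, OLS, MyCustomImputer]
method_imputations = get_imputations(
    model_classes, X_train, X_test, predictors, imputed_variables
)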
Best practices¶
Robust evaluation and benchmarking requires testing models across multiple diverse datasets rather than relying on a single test case. This approach helps identify which models perform consistently well across different data scenarios and which may be sensitive to particular data characteristics. By examining performance across varied contexts, you can make more confident generalizations about a method’s effectiveness.
A comprehensive evaluation should assess performance across different quantiles rather than focusing solely on central measures like the median. Many applications care about the tails of distributions, and models that perform well at the median might struggle with extreme quantiles. Evaluating across the full spectrum of quantiles provides a more complete picture of each method’s strengths and limitations.
While statistical performance is critical, practical considerations should not be overlooked. Different imputation methods can vary dramatically in their computational requirements, including training time, memory usage, and prediction speed. In many applications, a slightly less accurate method that runs orders of magnitude faster may be preferable. Consider these trade-offs explicitly in your evaluation framework.
For particularly important decisions, enhance the reliability of your performance estimates through cross-validation techniques. Cross-validation provides a more stable estimate of model performance by averaging results across multiple train-test splits, reducing the impact of any particular data division. This approach is especially valuable when working with smaller datasets where a single train-test split might not be representative.
The package also supports detailed assessment of model behavior through train-test performance comparisons via the model_performance_results() function. This visualization tool helps identify potential overfitting or underfitting issues by contrasting a model’s performance on training data with its performance on held-out test data. Significant disparities between training and testing performance can reveal important limitations in a model’s generalization capabilities.
For specialized applications with particular interest in certain parts of the distribution, the framework accommodates custom quantile sets for targeted evaluation. Rather than using the default (random) quantiles, researchers can specify exactly which quantiles to evaluate, allowing focused assessment of performance in regions of particular interest. This flexibility enables tailored evaluations that align precisely with application-specific requirements and priorities.