Statistical Matching imputation#

This notebook demonstrates how to use MicroImpute’s Matching imputer to impute values using the statistical matching approach. Statistical matching (also known as data fusion or synthetic matching) is a technique used to integrate information from different data sources.

The Matching model supports iterative imputation with a single object and workflow. Pass a list of imputed_variables containing every variable you want to impute, and the model will impute all of them without a separate fit and predict call for each one.

# Import needed libraries and setup R environment
import sys
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from rpy2.robjects import pandas2ri
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import warnings

# Keep rendered tables narrow: cap the width and column count, and stop pandas
# from wrapping wide frames onto several lines
display_options = {
    "display.width": 600,
    "display.max_columns": 10,
    "display.expand_frame_repr": False,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Import MicroImpute tools
from microimpute.evaluations import *
from microimpute.models import Matching
from microimpute.config import QUANTILES, RANDOM_STATE
from microimpute.visualizations.plotting import model_performance_results
from microimpute.comparisons.data import (
    preprocess_data,
    postprocess_imputations,
)

/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "PWD" redefined by R and overriding existing variable. Current: "/home/runner/work/microimpute/microimpute/docs", R: "/home/runner/work/microimpute/microimpute/docs/models/matching"
  warnings.warn(

/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_PAPERSIZE_USER" redefined by R and overriding existing variable. Current: "a4", R: "letter"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/Rtmp7infYN", R: "/tmp/RtmpRPiduz"
  warnings.warn(

/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pydantic/_internal/_generate_schema.py:623: UserWarning: <built-in function array> is not a Python type (it may be an instance of an object), Pydantic will allow any object with no validation since we cannot even enforce that the input is an instance of the given type. To get rid of this error wrap the type with `pydantic.SkipValidation`.
  warn(

# Load the diabetes dataset
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Add a random boolean variable. It is drawn from a generator seeded with
# RANDOM_STATE (imported from microimpute.config) so the column, and every
# result derived from it, is the same on each Restart & Run All.
rng = np.random.default_rng(RANDOM_STATE)
df["bool"] = rng.choice([True, False], size=len(df))
# Add synthetic weights (1, 2, ..., n)
df["wgt"] = range(1, len(df) + 1)

# Display the first few rows of the dataset
df.head()

	age	sex	bmi	bp	s1	...	s4	s5	s6	bool	wgt
0	0.038076	0.050680	0.061696	0.021872	-0.044223	...	-0.002592	0.019907	-0.017646	True	1
1	-0.001882	-0.044642	-0.051474	-0.026328	-0.008449	...	-0.039493	-0.068332	-0.092204	False	2
2	0.085299	0.050680	0.044451	-0.005670	-0.045599	...	-0.002592	0.002861	-0.025930	False	3
3	-0.089063	-0.044642	-0.011595	-0.036656	0.012191	...	0.034309	0.022688	-0.009362	True	4
4	0.005383	-0.044642	-0.036385	0.021872	0.003935	...	-0.002592	-0.031988	-0.046641	True	5
...	...	...	...	...	...	...	...	...	...	...	...
437	0.041708	0.050680	0.019662	0.059744	-0.005697	...	-0.002592	0.031193	0.007207	True	438
438	-0.005515	0.050680	-0.015906	-0.067642	0.049341	...	0.034309	-0.018114	0.044485	True	439
439	0.041708	0.050680	-0.015906	0.017293	-0.037344	...	-0.011080	-0.046883	0.015491	False	440
440	-0.045472	-0.044642	0.039062	0.001215	0.016318	...	0.026560	0.044529	-0.025930	True	441
441	-0.045472	-0.044642	-0.073030	-0.081413	0.083740	...	-0.039493	-0.004222	0.003064	False	442

442 rows × 12 columns

# Roles of the columns used by the model
predictors = ["age", "sex", "bmi", "bp"]
# Variables to impute:
#   s1   - total serum cholesterol
#   s4   - total cholesterol / HDL ratio
#   bool - the random boolean variable
imputed_variables = ["s1", "s4", "bool"]
weights = ["wgt"]

# Keep only the columns the example needs, in role order
needed_columns = [*predictors, *imputed_variables, *weights]
diabetes_df = df[needed_columns]

# Display summary statistics
diabetes_df.describe()

	age	sex	bmi	bp	s1	s4	wgt
count	4.420000e+02	4.420000e+02	4.420000e+02	4.420000e+02	4.420000e+02	4.420000e+02	442.000000
mean	-2.511817e-19	1.230790e-17	-2.245564e-16	-4.797570e-17	-1.381499e-17	-9.042540e-18	221.500000
std	4.761905e-02	4.761905e-02	4.761905e-02	4.761905e-02	4.761905e-02	4.761905e-02	127.738666
min	-1.072256e-01	-4.464164e-02	-9.027530e-02	-1.123988e-01	-1.267807e-01	-7.639450e-02	1.000000
25%	-3.729927e-02	-4.464164e-02	-3.422907e-02	-3.665608e-02	-3.424784e-02	-3.949338e-02	111.250000
50%	5.383060e-03	-4.464164e-02	-7.283766e-03	-5.670422e-03	-4.320866e-03	-2.592262e-03	221.500000
75%	3.807591e-02	5.068012e-02	3.124802e-02	3.564379e-02	2.835801e-02	3.430886e-02	331.750000
max	1.107267e-01	5.068012e-02	1.705552e-01	1.320436e-01	1.539137e-01	1.852344e-01	442.000000

warnings.filterwarnings("ignore")

# Split into training and testing sets and preprocess data types in a single
# call (preprocess_data can also normalize; that is switched off here)
X_train, X_test, dummy_info = preprocess_data(diabetes_df, test_size=0.2, normalize=False)

# Preprocessing may replace a column by one or more encoded columns. Swap the
# original name for the encoded names in whichever role list holds it;
# predictors are checked first and a column is only ever replaced in one list.
for original_col, encoded_cols in dummy_info["column_mapping"].items():
    for role_columns in (predictors, imputed_variables):
        if original_col in role_columns:
            role_columns.remove(original_col)
            role_columns.extend(encoded_cols)
            break

# Let's see how many records we have in each set
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training set size: {n_train} records")
print(f"Testing set size: {n_test} records")

Found 1 numeric columns with unique values < 10, treating as categorical: ['sex']. Converting to dummy variables.

Training set size: 353 records
Testing set size: 89 records

# dummy_info records how preprocess_data transformed the columns (original dtypes,
# original-to-encoded column mapping, original categories) so that imputed values
# can later be post-processed back to their original types
print("Dummy info:", dummy_info)

Dummy info: {'original_dtypes': {'bool': ('bool', dtype('bool')), 'sex': ('numeric categorical', dtype('float64'))}, 'column_mapping': {'bool': ['bool'], 'sex': ['sex_0.05068011873981862']}, 'original_categories': {'sex': [0.05068011873981862, -0.044641636506989144]}}

Simulating missing data#

For this example, we’ll simulate missing data in our test set by removing the values we want to impute.

# Keep the true values of the target columns for later comparison
actual_values = X_test[imputed_variables].copy()

# Work on a copy of the test set so X_test itself stays intact,
# then blank out every column that is going to be imputed
X_test_missing = X_test.copy()
for target_col in imputed_variables:
    X_test_missing[target_col] = np.nan

X_test_missing.head()

	age	bmi	bp	s1	s4	bool	wgt	sex_0.05068011873981862
287	0.045341	-0.006206	-0.015999	NaN	NaN	NaN	288.0	0.0
211	0.092564	0.036907	0.021872	NaN	NaN	NaN	212.0	0.0
72	0.063504	-0.004050	-0.012556	NaN	NaN	NaN	73.0	1.0
321	0.096197	0.051996	0.079265	NaN	NaN	NaN	322.0	0.0
73	0.012648	-0.020218	-0.002228	NaN	NaN	NaN	74.0	1.0

Training and using the Matching imputer#

Now we’ll train the Matching imputer and use it to impute the missing values in our test set.

# Quantiles we will request from the imputer.
# QUANTILES is the default list imported from microimpute.config; nothing is redefined here.
print(f"Modeling these quantiles: {QUANTILES}")

Modeling these quantiles: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

# Initialize the Matching imputer
matching_imputer = Matching()

# Fit the model with our training data
# NOTE: Matching is a donor-based statistical matching method, not a regression.
# Presumably fit() prepares the training records as the donor pool — confirm in the MicroImpute docs.
fitted_matching_imputer = matching_imputer.fit(
    X_train,
    predictors,
    imputed_variables,
    weight_col="wgt",  # weights will be used for sampling the training data
)

# Impute values in the test set
# This uses the fitted Matching model to fill in the missing values;
# the result is looked up by quantile (e.g. imputed_values[0.5])
imputed_values = fitted_matching_imputer.predict(X_test_missing, QUANTILES)

# Display the first few imputed values at the median (0.5 quantile)
imputed_values[0.5].head()

	s1	s4	bool
287	0.024574	-0.039493	1.0
211	0.030078	-0.039493	0.0
72	0.038334	-0.039493	0.0
321	-0.013953	-0.002592	0.0
73	-0.031840	-0.039493	0.0

# Post-process the imputed values to restore original data types,
# using the column information recorded in dummy_info
# (e.g. the 0.0/1.0 codes imputed for "bool" are shown as False/True again)
postprocess_imputed_values = postprocess_imputations(
    imputed_values,
    dummy_info,
)

# Show the first few post-processed imputations at the median (0.5 quantile)
postprocess_imputed_values[0.5].head()

	s1	s4	bool
287	0.024574	-0.039493	True
211	0.030078	-0.039493	False
72	0.038334	-0.039493	False
321	-0.013953	-0.002592	False
73	-0.031840	-0.039493	False

Evaluating the imputation results#

Now let’s compare the imputed values with the actual values to evaluate the performance of our imputer. Matching cannot adapt its predictions to specific quantiles, so whichever quantile we select, we obtain the same results.

# Extract median predictions for evaluation, with columns in the same order as
# actual_values so the flattened arrays below line up element by element
median_predictions = imputed_values[0.5][actual_values.columns]

# Range of the perfect-prediction diagonal: the overall min/max across both frames
min_val = min(actual_values.min().min(), median_predictions.min().min())
max_val = max(actual_values.max().max(), median_predictions.max().max())

# Convert data for plotting. values.flatten() is row-major (row 0's columns,
# then row 1's, ...), so tiling the column names once per row labels every
# point with the variable it belongs to.
plot_df = pd.DataFrame(
    {
        "Actual": actual_values.values.flatten(),
        "Imputed": median_predictions.values.flatten(),
        "Variable": np.tile(
            actual_values.columns.to_numpy(), len(actual_values)
        ),
    }
)

# Create the scatter plot. The imputed variables live on different scales
# (the boolean is 0/1, the others are small standardized values), so colour
# by variable instead of pooling them into one indistinguishable cloud.
fig = px.scatter(
    plot_df,
    x="Actual",
    y="Imputed",
    color="Variable",
    opacity=0.7,
    title="Comparison of Actual vs. Imputed Values using Matching",
)

# Add the diagonal line (perfect prediction line)
fig.add_trace(
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode="lines",
        line=dict(color="red", dash="dash"),
        name="Perfect Prediction",
    )
)

# Update layout
fig.update_layout(
    xaxis_title="Actual Values",
    yaxis_title="Imputed Values",
    width=750,
    height=600,
    template="plotly_white",
    margin=dict(l=50, r=50, t=80, b=50),  # Adjust margins
)

fig.show()

This scatter plot presents the performance of a matching-based imputation method by comparing the actual values (x-axis) to the imputed values (y-axis). Each dot represents a data point where a missing value was imputed using the nearest matched donor based on covariates. The red dashed line represents the ideal scenario of perfect prediction, where imputed values would exactly match actual values. Unlike model-based approaches such as Quantile Regression or Random Forests, the matching method shows greater dispersion around the ideal line, with several imputed values either overestimating or underestimating the true values. The scatter reveals that while some imputations are close to accurate, many are not, and the overall alignment with the perfect prediction line is weaker. This suggests that matching may introduce higher imputation error, especially when suitable matches are not available or when the matching algorithm doesn’t capture complex relationships between covariates and the missing variable.

Examining quantile predictions#

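The Matching imputer also accepts a list of quantiles, which keeps its interface consistent with the other imputers. However, because each imputed value is taken from a matched donor, the prediction is the same at every quantile, so (as the table below shows) the quantiles carry no information about the uncertainty of the imputation.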

# Compare predictions at different quantiles for the first 5 records of the
# first imputed variable (column 0 of actual_values / imputed_values[q])
n_preview = 5
quantiles_to_show = QUANTILES

# Start with the actual values, then add one column per quantile.
# round() rather than int(): q * 100 carries floating-point error
# (e.g. 0.29 * 100 == 28.999999999999996), and truncating would produce a
# wrong label such as "Q28" or collide with a neighbouring quantile's column.
comparison_columns = {"Actual": actual_values.iloc[:n_preview, 0].values}
for q in quantiles_to_show:
    label = f"Q{int(round(q * 100))}"
    comparison_columns[label] = imputed_values[q].iloc[:n_preview, 0].values

# Build the frame in one step instead of inserting columns one at a time
comparison_df = pd.DataFrame(comparison_columns)

comparison_df

	Actual	Q5	Q10	Q15	Q20	...	Q75	Q80	Q85	Q90	Q95
0	0.125019	0.024574	0.024574	0.024574	0.024574	...	0.024574	0.024574	0.024574	0.024574	0.024574
1	-0.024960	0.030078	0.030078	0.030078	0.030078	...	0.030078	0.030078	0.030078	0.030078	0.030078
2	0.103003	0.038334	0.038334	0.038334	0.038334	...	0.038334	0.038334	0.038334	0.038334	0.038334
3	0.054845	-0.013953	-0.013953	-0.013953	-0.013953	...	-0.013953	-0.013953	-0.013953	-0.013953	-0.013953
4	0.038334	-0.031840	-0.031840	-0.031840	-0.031840	...	-0.031840	-0.031840	-0.031840	-0.031840	-0.031840

5 rows × 20 columns

Visualizing prediction intervals#

By visualizing the prediction intervals of the model’s imputations we can better understand the uncertainty in our imputed values.

# Create a prediction interval plot for the first 10 records of the first
# imputed variable (column 0). Never ask for more records than the test set has.
n_records = min(10, len(actual_values))

# Prepare data for plotting
records = list(range(n_records))
actuals = actual_values.iloc[:n_records, 0].values
medians = imputed_values[0.5].iloc[:n_records, 0].values
q30 = imputed_values[0.3].iloc[:n_records, 0].values
q70 = imputed_values[0.7].iloc[:n_records, 0].values
q10 = imputed_values[0.1].iloc[:n_records, 0].values
q90 = imputed_values[0.9].iloc[:n_records, 0].values

# (legend label, lower bound, upper bound, colour) for each interval band.
# The wide 80% band comes first so the narrower 40% band is drawn on top of it.
interval_bands = [
    ("80% PI (Q10-Q90)", q10, q90, "rgba(173, 216, 230, 0.3)"),
    ("40% PI (Q30-Q70)", q30, q70, "rgba(70, 130, 180, 0.5)"),
]

# Create the base figure
fig = go.Figure()

# Draw one vertical bar per record and band
for _, lower, upper, band_color in interval_bands:
    for i in records:
        fig.add_trace(
            go.Scatter(
                x=[i, i],
                y=[lower[i], upper[i]],
                mode="lines",
                line=dict(width=10, color=band_color),
                hoverinfo="none",
                showlegend=False,
            )
        )

# Add actual values
fig.add_trace(
    go.Scatter(
        x=records,
        y=actuals,
        mode="markers",
        marker=dict(color="black", size=8),
        name="Actual",
    )
)

# Add median predictions
fig.add_trace(
    go.Scatter(
        x=records,
        y=medians,
        mode="markers",
        marker=dict(color="red", size=8),
        name="Median (Q50)",
    )
)

# Legend entries for the interval bands. The bars above are hidden from the
# legend (one trace per record), so add one legend-only proxy per band. The
# proxies have no data ([None]): placing dummy points at x=-1, y=0 would pull
# the axes' auto-range out to include a point that is not part of the data.
for band_label, _, _, band_color in interval_bands:
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode="lines",
            line=dict(color=band_color, width=10),
            name=band_label,
        )
    )

# Update layout with smaller width to fit in the book layout
grid_style = dict(showgrid=True, gridwidth=1, gridcolor="rgba(211, 211, 211, 0.7)")
fig.update_layout(
    title="Matching Imputation Prediction Intervals",
    xaxis=dict(title="Data Record Index", **grid_style),
    yaxis=dict(title="Total Serum Cholesterol (s1)", **grid_style),
    width=750,
    height=600,
    template="plotly_white",
    margin=dict(l=50, r=50, t=80, b=50),  # Adjust margins
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

fig.show()

This plot displays the prediction intervals produced by a Matching model for total serum cholesterol values across ten data records. Each red dot indicates the imputed median value (Q50) for a missing observation, while black dots represent the corresponding true values. Light blue and dark blue vertical bars would represent the 80% (Q10–Q90) and 40% (Q30–Q70) prediction intervals, respectively. Unlike model-based methods, all records lack visible interval bars entirely. This reflects the limited variability inherent in matching methods, where each imputed value is drawn from a single matched donor or a small set of similar units. As a result, the model cannot capture the full uncertainty of the imputed values, as all quantile estimates collapse to the same value. Additionally, many of the imputed medians lie far from the actual values. This highlights a key limitation of matching-based imputation: while simple and interpretable, it may lack the flexibility to accurately quantify uncertainty or represent the underlying distribution, especially in complex or high-variance data.

## Assessing the method’s performance

To check whether our model is overfitting and ensure robust results we can perform cross-validation and visualize the results.

# Reset the column roles for cross-validation: the names were rewritten in
# place after the earlier preprocessing, and the boolean variable is left out.
predictors = ["age", "sex", "bmi", "bp"]
imputed_variables = ["s1", "s4"]
weights = ["wgt"]  # restated so this cell does not depend on an earlier one

# Rebuild the raw frame from scratch
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
# Seeded draw (RANDOM_STATE from microimpute.config) so re-running the
# notebook reproduces the same frame
df["bool"] = np.random.default_rng(RANDOM_STATE).choice(
    [True, False], size=len(df)
)
df["wgt"] = range(1, len(df) + 1)
diabetes_df = df[predictors + imputed_variables + weights]

# Preprocess the full data set without a train/test split
diabetes_df, dummy_info = preprocess_data(
    diabetes_df,
    full_data=True,
)

# Replace any column that preprocessing encoded with its encoded column
# names, in whichever role list holds it (predictors are checked first)
for col, dummy_cols in dummy_info["column_mapping"].items():
    if col in predictors:
        predictors.remove(col)
        predictors.extend(dummy_cols)
    elif col in imputed_variables:
        imputed_variables.remove(col)
        imputed_variables.extend(dummy_cols)

Found 1 numeric columns with unique values < 10, treating as categorical: ['sex']. Converting to dummy variables.

# Run cross-validation on the same data set removing the boolean variable
# NOTE(review): cross_validate_model is not imported by name; presumably it is
# provided by the wildcard `from microimpute.evaluations import *` — confirm
matching_results = cross_validate_model(
    Matching, diabetes_df, predictors, imputed_variables
)

# Display the results
matching_results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.

/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pydantic/_internal/_generate_schema.py:623: UserWarning: <built-in function array> is not a Python type (it may be an instance of an object), Pydantic will allow any object with no validation since we cannot even enforce that the input is an instance of the given type. To get rid of this error wrap the type with `pydantic.SkipValidation`.
  warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pydantic/_internal/_generate_schema.py:623: UserWarning: <built-in function array> is not a Python type (it may be an instance of an object), Pydantic will allow any object with no validation since we cannot even enforce that the input is an instance of the given type. To get rid of this error wrap the type with `pydantic.SkipValidation`.
  warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pydantic/_internal/_generate_schema.py:623: UserWarning: <built-in function array> is not a Python type (it may be an instance of an object), Pydantic will allow any object with no validation since we cannot even enforce that the input is an instance of the given type. To get rid of this error wrap the type with `pydantic.SkipValidation`.
  warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pydantic/_internal/_generate_schema.py:623: UserWarning: <built-in function array> is not a Python type (it may be an instance of an object), Pydantic will allow any object with no validation since we cannot even enforce that the input is an instance of the given type. To get rid of this error wrap the type with `pydantic.SkipValidation`.
  warn(

/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/RtmpRPiduz", R: "/tmp/RtmpP2Nnbq"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/RtmpRPiduz", R: "/tmp/Rtmp2yRbIm"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/RtmpRPiduz", R: "/tmp/RtmpvRjxHJ"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/RtmpRPiduz", R: "/tmp/RtmpegBocy"
  warnings.warn(

/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/RtmpP2Nnbq", R: "/tmp/Rtmp9XevSY"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/Rtmp2yRbIm", R: "/tmp/RtmpwYtE6t"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/RtmpvRjxHJ", R: "/tmp/Rtmpcta9Mt"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.11.12/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_LIBS_SITE" redefined by R and overriding existing variable. Current: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library", R: "/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library/:/usr/local/lib/R/site-library:/usr/lib/R/site-library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library:/usr/lib/R/library"
  warnings.warn(
/opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/rpy2/rinterface/__init__.py:1211: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/RtmpegBocy", R: "/tmp/Rtmp0MfSCR"
  warnings.warn(

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.7s remaining:   11.6s

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    8.2s remaining:    5.4s

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.8s finished

	0.05	0.10	0.15	0.20	0.25	...	0.75	0.80	0.85	0.90	0.95
train	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000
test	0.023949	0.023909	0.023868	0.023828	0.023787	...	0.023382	0.023342	0.023302	0.023261	0.023221

2 rows × 19 columns

# Visualize the cross-validation results held in `matching_results`
perf_results_viz = model_performance_results(
    results=matching_results,
    model_name="Matching",
    method_name="Cross-Validation Quantile Loss Average",
)

# Build the figure from the visualization object, then render it inline
fig = perf_results_viz.plot(title="Matching Cross-Validation Performance")
fig.show()

The train loss is 0 in this case because, when the model is evaluated on its own training data, each record finds itself among the donors as a perfect match. This is not the case for the test data, so the test loss is positive. Note that this will not occur when the donor and receiver data sets are different from each other.

# Tuning the Matching model

The Matching imputer supports various parameters that can be adjusted to improve performance. If you already know which values work well for your dataset, you can set them explicitly, as shown below. Alternatively, automatic hyperparameter tuning specific to the target dataset can be enabled by setting the parameter tune_hyperparameters to True.

# Reset the variable lists: the loop at the end of this cell mutates them in place
predictors = ["age", "sex", "bmi", "bp"]
imputed_variables = ["s1", "s4"]

# Load the diabetes data into a frame and append two synthetic columns
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df["bool"] = np.random.choice([True, False], size=len(df))  # unseeded random flag
df["wgt"] = range(1, len(df) + 1)  # 1..N running values

# NOTE: `weights` is not defined in this cell; it must already exist in the
# notebook namespace (presumably a list of column names -- confirm upstream).
diabetes_df = df[predictors + imputed_variables + weights]

X_train, X_test, dummy_info = preprocess_data(diabetes_df, full_data=False)

# Swap every column listed in the mapping for its replacement columns,
# in whichever of the two variable lists currently contains it.
for col, dummy_cols in dummy_info["column_mapping"].items():
    if col in predictors:
        column_list = predictors
    elif col in imputed_variables:
        column_list = imputed_variables
    else:
        continue  # column belongs to neither list; nothing to rename
    column_list.remove(col)
    column_list.extend(dummy_cols)

Found 1 numeric columns with unique values < 10, treating as categorical: ['sex']. Converting to dummy variables.

# Hyperparameters can be fixed by hand: pass them as extra keyword
# arguments to `fit` alongside the data and variable lists.
fitted_matching_imputer = matching_imputer.fit(
    X_train=X_train,
    predictors=predictors,
    imputed_variables=imputed_variables,
    # Request constrained matching
    constrained=True,
)

# Reset the variable lists: the loop at the end of this cell mutates them in place
predictors = ["age", "sex", "bmi", "bp"]
imputed_variables = ["s1", "s4"]

# Load the diabetes data into a frame and append two synthetic columns
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df["bool"] = np.random.choice([True, False], size=len(df))  # unseeded random flag
df["wgt"] = range(1, len(df) + 1)  # 1..N running values

# NOTE: `weights` is not defined in this cell; it must already exist in the
# notebook namespace (presumably a list of column names -- confirm upstream).
diabetes_df = df[predictors + imputed_variables + weights]

X_train, X_test, dummy_info = preprocess_data(diabetes_df, full_data=False)

# Swap every column listed in the mapping for its replacement columns,
# in whichever of the two variable lists currently contains it.
for col, dummy_cols in dummy_info["column_mapping"].items():
    if col in predictors:
        column_list = predictors
    elif col in imputed_variables:
        column_list = imputed_variables
    else:
        continue  # column belongs to neither list; nothing to rename
    column_list.remove(col)
    column_list.extend(dummy_cols)

Found 1 numeric columns with unique values < 10, treating as categorical: ['sex']. Converting to dummy variables.

# Let the imputer search for hyperparameters on this dataset. With
# tune_hyperparameters=True the call is unpacked into two values:
# the fitted model and the parameters that were selected.
fitted_matching_imputer, best_tuned_params = matching_imputer.fit(
    X_train=X_train,
    predictors=predictors,
    imputed_variables=imputed_variables,
    tune_hyperparameters=True,
)

# Show the selected hyperparameters
print(best_tuned_params)

{'dist_fun': 'Gower', 'constrained': True, 'constr_alg': 'hungarian', 'k': 5}