This notebook demonstrates the sparse matrix construction for local area (congressional district) calibration. It uses a subset of CDs from NC, HI, MT, and AK for manageable runtime.
Section 1: Setup & Configuration¶
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
from policyengine_us import Microsimulation
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
SparseMatrixBuilder,
)
from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (
MatrixTracer,
)
from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
get_calculated_variables,
create_target_groups,
)
db_path = STORAGE_FOLDER / "policy_data.db"
db_uri = f"sqlite:///{db_path}"
dataset_path = str(STORAGE_FOLDER / "stratified_extended_cps_2023.h5")
engine = create_engine(db_uri)
Section 2: Select Test Congressional Districts¶
We use CDs from 4 states for testing (a short sketch after this list shows how their geoids are formed):
NC (37): 14 CDs (3701-3714) - provides same-state different-CD test cases
HI (15): 2 CDs (1501-1502)
MT (30): 2 CDs (3001-3002)
AK (2): 1 at-large CD (geoid 200 or 201, depending on the numbering convention; the query below matches both)
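As a quick illustration of why the LIKE patterns in the query below work: each CD geoid appears to be the state FIPS code followed by a two-digit district number (an observation from the geoids listed above, not a documented rule). A minimal sketch with a hypothetical helper:
# Sketch (assumption): CD geoid = state FIPS (no zero padding) + two-digit district number
def cd_geoid(state_fips: int, district: int) -> str:
    return f"{state_fips}{district:02d}"

print(cd_geoid(37, 1))                 # '3701' -> NC-01, matched by LIKE '37__'
print(cd_geoid(2, 0), cd_geoid(2, 1))  # '200' / '201' -> AK at-large, depending on convention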
query = """
SELECT DISTINCT sc.value as cd_geoid
FROM strata s
JOIN stratum_constraints sc ON s.stratum_id = sc.stratum_id
WHERE s.stratum_group_id = 1
AND sc.constraint_variable = 'congressional_district_geoid'
AND (
sc.value LIKE '37__'
OR sc.value LIKE '150_'
OR sc.value LIKE '300_'
OR sc.value = '200' OR sc.value = '201'
)
ORDER BY sc.value
"""
with engine.connect() as conn:
result = conn.execute(text(query)).fetchall()
test_cds = [row[0] for row in result]
print(f"Testing with {len(test_cds)} congressional districts:")
print(f" NC (37): {[cd for cd in test_cds if cd.startswith('37')]}")
print(f" HI (15): {[cd for cd in test_cds if cd.startswith('15')]}")
print(f" MT (30): {[cd for cd in test_cds if cd.startswith('30')]}")
print(f" AK (2): {[cd for cd in test_cds if cd.startswith('20')]}")Section 3: Build the Sparse Matrix¶
The sparse matrix X_sparse has:
Rows: Calibration targets (e.g., SNAP totals by geography)
Columns: (household × CD) pairs - each household appears once per CD
We filter to SNAP targets only (stratum_group_id=4) for this demonstration.
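Before building the real matrix, a toy picture of the layout may help. This uses hypothetical numbers (two households, one NC CD, one AK CD, two state-level SNAP targets) and an illustrative column ordering, not the builder's actual one:
# Toy layout, hypothetical values.
# Columns: (hh1, '3701'), (hh2, '3701'), (hh1, '201'), (hh2, '201')
toy = np.array([
    [1200.0, 300.0,    0.0,   0.0],  # NC state SNAP target: AK columns are zero
    [   0.0,   0.0, 1350.0, 320.0],  # AK state SNAP target: NC columns are zero
])
print(toy.shape)  # (2 targets, 2 households x 2 CDs)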
sim = Microsimulation(dataset=dataset_path)
builder = SparseMatrixBuilder(
db_uri,
time_period=2023,
cds_to_calibrate=test_cds,
dataset_path=dataset_path,
)
targets_df, X_sparse, household_id_mapping = builder.build_matrix(
sim, target_filter={"stratum_group_ids": [4], "variables": ["snap"]}
)
print(f"X_sparse shape: {X_sparse.shape}")
print(f" Rows (targets): {X_sparse.shape[0]}")
print(f" Columns (household × CD pairs): {X_sparse.shape[1]}")
print(f" Non-zero entries: {X_sparse.nnz:,}")
print(f" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}")Section 4: Understanding the Matrix Structure with MatrixTracer¶
The MatrixTracer helps navigate the sparse matrix by providing lookups between the following (a toy illustration of the idea appears after this list):
Column indices ↔ (household_id, CD) pairs
Row indices ↔ target definitions
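To make the idea concrete, here is a toy version of such a two-way lookup, with made-up household IDs and an assumed column ordering; it illustrates the concept only, not MatrixTracer's actual internals:
# Toy two-way lookup between column indices and (household_id, cd) pairs
household_ids = [101, 102, 103]  # made-up IDs
cds = ["3701", "3702"]           # two NC districts
columns = [(hh, cd) for cd in cds for hh in household_ids]  # one column per pair
col_of = {pair: i for i, pair in enumerate(columns)}
print(columns[4])             # (102, '3702')
print(col_of[(101, "3702")])  # 3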
tracer = MatrixTracer(
targets_df, X_sparse, household_id_mapping, test_cds, sim
)
tracer.print_matrix_structure()
target_groups, group_info = create_target_groups(targets_df)
target_group = tracer.get_group_rows(2)
row_loc = target_group.iloc[28]['row_index'] # Manually found the index value 28
row_info = tracer.get_row_info(row_loc)
var = row_info['variable']
var_desc = row_info['variable_desc']
target_geo_id = int(row_info['geographic_id'])
print("Row info for North Carolina's SNAP benefit amount:")
print(row_info)
hh_snap_df = pd.DataFrame(sim.calculate_dataframe([
"household_id", "household_weight", "state_fips", "snap"])
)
print(hh_snap_df)
If we were to include congressional_district_geoid above, its values would all be zero. It is not until we do the calibration, i.e., come back with a vector of weights w to multiply X_sparse by, that congressional_district_geoid gets set.
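As a rough sketch of where this is heading (the calibration itself is outside this notebook), the weight vector w enters by multiplying X_sparse, producing one estimated total per target row:
# Sketch only: a dummy uniform weight vector, just to show how w is applied.
# The real calibration chooses w so that X_sparse @ w approaches the target values.
w = np.ones(X_sparse.shape[1])
estimated_totals = X_sparse @ w
print(estimated_totals[:5])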
However, every household is already a donor to every congressional district. You can get the column positions for a household (remember: targets are on the rows, donor households on the columns) by calling the tracer's get_household_column_positions with the original household_id.
# Reverse lookup: get all column positions for a specific household
hh_id = hh_snap_df.loc[hh_snap_df.snap > 0].household_id.values[0]
print(hh_snap_df.loc[hh_snap_df.household_id == hh_id])
print("\nEvaluating the tracer.get_household_column_positions dictionary:\n")
positions = tracer.get_household_column_positions(hh_id)
print(positions)
Section 5: Understanding the Cells of the X_sparse Matrix and the Target Vector¶
print("Remember, this is a North Carolina target:\n")
print(targets_df.iloc[row_loc])
print("\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:")
print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district
print("\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:")
print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large DistrictKey property: For state-level targets, only CDs in that state should have non-zero values.
Example: A NC state SNAP target should have zeros for HI, MT, and AK CD columns.
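One quick, illustrative way to check this for the household above, reusing the positions dictionary:
# Illustrative check: for the NC state SNAP target row, this household's
# columns for the non-NC CDs (HI, MT, AK) should all be zero.
non_nc_values = [
    X_sparse[row_loc, positions[cd]] for cd in test_cds if not cd.startswith("37")
]
print(all(v == 0 for v in non_nc_values))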
So let’s see that same household’s value for the Alaska state target:
target_group = tracer.get_group_rows(2)
new_row_loc = target_group.iloc[10]['row_index'] # Manually found the index value 10
row_info = tracer.get_row_info(new_row_loc)
var = row_info['variable']
var_desc = row_info['variable_desc']
target_geo_id = int(row_info['geographic_id'])
print("Row info for Alaska's SNAP benefit amount:")
print(row_info)
print("\nHousehold donated to AK's at-large district, 2023 SNAP dollars:")
print(X_sparse[new_row_loc, positions['201']])  # Household donated to AK's at-large district
Section 6: Simulating State-Swapped Calculations¶
When a household is “transplanted” to a different state, state-dependent benefits like SNAP are recalculated under the destination state’s rules.
def create_state_simulation(state_fips):
"""Create a simulation with all households assigned to a specific state."""
s = Microsimulation(dataset=dataset_path)
s.set_input(
"state_fips", 2023, np.full(hh_snap_df.shape[0], state_fips, dtype=np.int32)
)
for var in get_calculated_variables(s):
s.delete_arrays(var)
return s
# Compare SNAP for first 5 households under NC vs AK rules
nc_sim = create_state_simulation(37) # NC
ak_sim = create_state_simulation(2) # AK
nc_snap = nc_sim.calculate("snap", map_to="household").values[:5]
ak_snap = ak_sim.calculate("snap", map_to="household").values[:5]
print("SNAP values for first 5 households under different state rules:")
print(f" NC rules: {nc_snap}")
print(f" AK rules: {ak_snap}")
print(f" Difference: {ak_snap - nc_snap}")