Writing datasets

A common use case of PolicyEngine Core country models is simulating not just a few households but thousands, in the form of microsimulation on survey data. This technique can be used to simulate the impact of a policy on a population, or to compare the impact of different policies on the same population. To do this, we need to load data into PolicyEngine Core, which we do in a standardised format through the Dataset class.

Example

Here’s the Country Template’s default example for a dataset.

from policyengine_core.country_template.constants import COUNTRY_DIR
from policyengine_core.data import Dataset
from policyengine_core.periods import ETERNITY, MONTH, period


class CountryTemplateDataset(Dataset):
    """Minimal example dataset: three people living in two households."""

    # Identification and storage metadata for this dataset.
    name = "country_template_dataset"
    label = "Country template dataset"
    file_path = (
        COUNTRY_DIR / "data" / "storage" / "country_template_dataset.h5"
    )
    # Layout of the saved data; generate() builds a mapping of
    # variable name -> {period: values} to go with it.
    data_format = Dataset.TIME_PERIOD_ARRAYS

    def generate(self) -> None:
        """Build the example arrays and pass them to ``save_dataset``.

        Identifier and role variables are stored under ``ETERNITY``;
        ``salary`` is stored under the month 2022-01 and
        ``household_weight`` under the year 2022.
        """
        # Periods for the two variables that are tied to a specific time.
        salary_period = period("2022-01")
        weight_period = period("2022")

        # Variables that do not vary over time, keyed by variable name.
        # Person-level lists have three entries, household-level lists two.
        timeless_values = {
            "person_id": [0, 1, 2],
            "household_id": [0, 1],
            "person_household_id": [0, 0, 1],
            "person_household_role": ["parent", "child", "parent"],
        }

        # Wrap each timeless variable in a single-entry {ETERNITY: values}
        # mapping, then append the period-specific variables.
        data = {
            variable: {ETERNITY: values}
            for variable, values in timeless_values.items()
        }
        data["salary"] = {salary_period: [100, 0, 200]}
        data["household_weight"] = {weight_period: [1e6, 1.2e6]}

        self.save_dataset(data)


from policyengine_core.country_template import Simulation

# Run the generation step once; generate() builds the example arrays and
# hands them to save_dataset (presumably persisting them at file_path —
# confirm against the Dataset base class).
CountryTemplateDataset().generate()

# Build a simulation from the dataset. Note that the dataset class itself
# is passed here, not an instance.
simulation = Simulation(dataset=CountryTemplateDataset)

# Ask for salary in January 2022, the same period the dataset stores
# salary under. As the last expression, its value is what gets displayed.
simulation.calculate("salary", "2022-01")
array([100.,   0., 200.], dtype=float32)