Source code for policyengine_core.data.dataset

from pathlib import Path
from typing import Dict, Union, List
import h5py
import numpy as np
import pandas as pd
import shutil
import requests
import os


class Dataset:
    """Base class for datasets used directly or indirectly by microsimulation models.

    A dataset defines a generation function to create it from other data, and
    this class provides common features like storage, metadata and loading.
    Subclasses configure themselves through the class attributes below and
    usually override `generate`.
    """

    name: str = None
    """The name of the dataset. This is used to generate filenames and is used as the key in the `datasets` dictionary."""
    label: str = None
    """The label of the dataset. This is used for logging and is used as the key in the `datasets` dictionary."""
    data_format: str = None
    """The format of the dataset. This can be either `Dataset.ARRAYS`, `Dataset.TIME_PERIOD_ARRAYS`, `Dataset.TABLES` or `Dataset.FLAT_FILE`. If `Dataset.ARRAYS`, the dataset is stored as a collection of arrays. If `Dataset.TIME_PERIOD_ARRAYS`, the dataset is stored as a collection of arrays, with one array per time period. If `Dataset.TABLES`, the dataset is stored as a collection of tables (DataFrames). If `Dataset.FLAT_FILE`, the dataset is a single CSV file."""
    file_path: Path = None
    """The path to the dataset file. This is used to load the dataset from a file."""
    time_period: str = None
    """The time period of the dataset. This is used to automatically enter the values in the correct time period if the data type is `Dataset.ARRAYS`."""
    url: str = None
    """The URL to download the dataset from. This is used to download the dataset if it does not exist."""

    # Data formats
    TABLES = "tables"
    ARRAYS = "arrays"
    TIME_PERIOD_ARRAYS = "time_period_arrays"
    FLAT_FILE = "flat_file"

    # Per-instance cache of tables already read from a TABLES store
    # (replaced by a fresh dict in __init__ and reset on every write).
    _table_cache: Dict[str, pd.DataFrame] = None

    def __init__(self, require: bool = False):
        """Validates the class-level configuration and prepares storage.

        Args:
            require (bool, optional): If True and the dataset file does not
                exist, download it (when `url` is set) or generate it.
                Defaults to False.

        Raises:
            ValueError: If `file_path` has not been set on the class.
            AssertionError: If `name`, `label` or `data_format` is missing or invalid.
        """
        # Setup dataset
        if self.file_path is None:
            raise ValueError(
                "Dataset file_path must be specified in the dataset class definition."
            )
        elif isinstance(self.file_path, str):
            self.file_path = Path(self.file_path)

        # Make sure the containing folder exists so later writes cannot fail on it.
        self.file_path.parent.mkdir(parents=True, exist_ok=True)

        assert (
            self.name
        ), "You tried to instantiate a Dataset object, but no name has been provided."
        assert (
            self.label
        ), "You tried to instantiate a Dataset object, but no label has been provided."

        assert self.data_format in [
            Dataset.TABLES,
            Dataset.ARRAYS,
            Dataset.TIME_PERIOD_ARRAYS,
            Dataset.FLAT_FILE,
        ], f"You tried to instantiate a Dataset object, but your data_format attribute is invalid ({self.data_format})."

        self._table_cache = {}

        if not self.exists and require:
            if self.url is not None:
                self.download()
            else:
                self.generate()

    # The return annotation is a string so it is not evaluated when the class
    # is defined.
    def load(
        self, key: str = None, mode: str = "r"
    ) -> "Union[h5py.File, np.ndarray, pd.DataFrame, pd.HDFStore]":
        """Loads the dataset, or a single entry of it.

        Without a key, array formats return an open H5 file reader that can be
        accessed like a dictionary (e.g. `dataset.load()["variable"]`), table
        formats return an open `pd.HDFStore`, and flat files return the whole
        DataFrame. The caller is responsible for closing returned file handles.

        Args:
            key (str, optional): The key to load. Defaults to None.
            mode (str, optional): The mode to open the file with. Defaults to "r".

        Returns:
            Union[h5py.File, np.ndarray, pd.DataFrame, pd.HDFStore]: The dataset.

        Raises:
            ValueError: If a key is requested from a flat file dataset, or if
                the data format is invalid.
        """
        file = self.file_path
        if self.data_format in (Dataset.ARRAYS, Dataset.TIME_PERIOD_ARRAYS):
            if key is None:
                # If no key provided, return the basic H5 reader.
                return h5py.File(file, mode=mode)
            # If key provided, return only the values requested.
            with h5py.File(file, mode=mode) as f:
                return np.array(f[key])
        elif self.data_format == Dataset.TABLES:
            if key is None:
                # Non-openfisca datasets are assumed to be of the format
                # (table name: [table], ...). The store is returned in pandas'
                # default mode, as before, so existing callers keep working.
                return pd.HDFStore(file)
            if key in self._table_cache:
                return self._table_cache[key]
            # If a table name is provided, return that table. Honour `mode`
            # (read-only by default) so a read never creates a missing file.
            with pd.HDFStore(file, mode=mode) as f:
                values = f[key]
            self._table_cache[key] = values
            return values
        elif self.data_format == Dataset.FLAT_FILE:
            if key is None:
                return pd.read_csv(file)
            raise ValueError(
                "You tried to load a key from a flat file dataset, but flat file datasets do not support keys."
            )
        else:
            raise ValueError(
                f"Invalid data format {self.data_format} for dataset {self.label}."
            )

    def save(self, key: str, values: Union[np.ndarray, pd.DataFrame]):
        """Overwrites the values for `key` with `values`.

        Args:
            key (str): The key to save. Ignored for flat file datasets, which
                hold a single table.
            values (Union[np.ndarray, pd.DataFrame]): The values to save.

        Raises:
            ValueError: If the data format is invalid.
        """
        file = self.file_path
        if self.data_format in (Dataset.ARRAYS, Dataset.TIME_PERIOD_ARRAYS):
            with h5py.File(file, "a") as f:
                # Overwrite if existing
                if key in f:
                    del f[key]
                f.create_dataset(key, data=values)
        elif self.data_format == Dataset.TABLES:
            with pd.HDFStore(file, "a") as f:
                f.put(key, values)
            # Cached tables may now be stale.
            self._table_cache = {}
        elif self.data_format == Dataset.FLAT_FILE:
            values.to_csv(file, index=False)
        else:
            raise ValueError(
                f"Invalid data format {self.data_format} for dataset {self.label}."
            )

    def save_dataset(self, data, file_path: str = None) -> None:
        """Writes a complete dataset to disk.

        Args:
            data: The data to save. A mapping of table name to DataFrame for
                `TABLES`, of variable to array for `ARRAYS`, of variable to
                {time period: array} for `TIME_PERIOD_ARRAYS`, or a DataFrame
                for `FLAT_FILE`.
            file_path (str, optional): Where to write the dataset. Defaults to
                the dataset's own `file_path`.

        Raises:
            ValueError: If a value cannot be written, or the data format is invalid.

        >>> example_data: Dict[str, Dict[str, Sequence]] = {
        ...     "employment_income": {
        ...         "2022": np.array([25000, 25000, 30000, 30000]),
        ...     },
        ... }
        >>> example_data["employment_income"]["2022"] = [25000, 25000, 30000, 30000]
        """
        if file_path is not None:
            # An explicit target always wins over the dataset's own path.
            file = Path(file_path)
        else:
            if not isinstance(self.file_path, Path):
                self.file_path = Path(self.file_path)
            file = self.file_path

        if self.data_format == Dataset.TABLES:
            with pd.HDFStore(file, "a") as store:
                for table_name, dataframe in data.items():
                    store.put(table_name, dataframe)
            # Cached tables may now be stale.
            self._table_cache = {}
        elif self.data_format == Dataset.TIME_PERIOD_ARRAYS:
            with h5py.File(file, "w") as f:
                for variable, values in data.items():
                    for time_period, value in values.items():
                        key = f"{variable}/{time_period}"
                        # Overwrite if existing
                        if key in f:
                            del f[key]
                        try:
                            f.create_dataset(key, data=value)
                        except Exception as error:
                            raise ValueError(
                                f"Could not save {key} to {file}. The value is {value}."
                            ) from error
        elif self.data_format == Dataset.ARRAYS:
            with h5py.File(file, "a" if file.exists() else "w") as f:
                for variable, value in data.items():
                    # Overwrite if existing
                    if variable in f:
                        del f[variable]
                    try:
                        f.create_dataset(variable, data=value)
                    except Exception as error:
                        raise ValueError(
                            f"Could not save {variable} to {file}. The value is {value}."
                        ) from error
        elif self.data_format == Dataset.FLAT_FILE:
            data.to_csv(file, index=False)
        else:
            raise ValueError(
                f"Invalid data format {self.data_format} for dataset {self.label}."
            )

    def load_dataset(
        self,
    ):
        """Loads a complete dataset from disk.

        Returns:
            Dict[str, Dict[str, Sequence]]: The dataset. For `TABLES` and
            `ARRAYS` this is a flat mapping of name to values, for
            `TIME_PERIOD_ARRAYS` a mapping of variable to {time period: array},
            and for `FLAT_FILE` the DataFrame itself.

        Raises:
            ValueError: If the data format is invalid.
        """
        file = self.file_path
        if self.data_format == Dataset.TABLES:
            # Read-only, so loading never creates a missing store.
            with pd.HDFStore(file, mode="r") as f:
                data = {table_name: f[table_name] for table_name in f.keys()}
        elif self.data_format == Dataset.TIME_PERIOD_ARRAYS:
            with h5py.File(file, "r") as f:
                data = {}
                for variable in f.keys():
                    data[variable] = {}
                    for time_period in f[variable].keys():
                        key = f"{variable}/{time_period}"
                        data[variable][time_period] = np.array(f[key])
        elif self.data_format == Dataset.ARRAYS:
            with h5py.File(file, "r") as f:
                data = {
                    variable: np.array(f[variable]) for variable in f.keys()
                }
        elif self.data_format == Dataset.FLAT_FILE:
            # Go through `load` so in-memory datasets that override it work too.
            data = self.load()
        else:
            raise ValueError(
                f"Invalid data format {self.data_format} for dataset {self.label}."
            )
        return data

    def generate(self):
        """Generates the dataset (all datasets should implement this method).

        Raises:
            NotImplementedError: If the function has not been overridden.
        """
        raise NotImplementedError(
            f"You tried to generate the dataset for {self.label}, but no dataset generation implementation has been provided for {self.label}."
        )

    @property
    def exists(self) -> bool:
        """Checks whether the dataset exists.

        Returns:
            bool: Whether the dataset file exists on disk.
        """
        return self.file_path.exists()

    @property
    def variables(self) -> List[str]:
        """Returns the variables in the dataset.

        Returns:
            List[str]: The top-level keys (tables, variables or CSV columns).

        Raises:
            ValueError: If the data format is invalid.
        """
        if self.data_format == Dataset.TABLES:
            # Read-only, so listing never creates a missing store.
            with pd.HDFStore(self.file_path, mode="r") as f:
                return list(f.keys())
        elif self.data_format in (Dataset.ARRAYS, Dataset.TIME_PERIOD_ARRAYS):
            with h5py.File(self.file_path, "r") as f:
                return list(f.keys())
        elif self.data_format == Dataset.FLAT_FILE:
            # Only the header row is needed to list the columns.
            return pd.read_csv(self.file_path, nrows=0).columns.tolist()
        else:
            raise ValueError(
                f"Invalid data format {self.data_format} for dataset {self.label}."
            )

    def __getattr__(self, name):
        """Allows dataset entries to be accessed as attributes.

        Only called when normal attribute lookup fails. Private and dunder
        names are never dataset keys, so they raise `AttributeError` straight
        away; this keeps `hasattr`, `copy` and `pickle` probes from triggering
        file I/O or unrelated exceptions.

        Args:
            name (str): The key to access.

        Returns:
            Union[np.ndarray, pd.DataFrame]: The values stored under `name`.

        Raises:
            AttributeError: If `name` starts with an underscore.
        """
        if name.startswith("_"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return self.load(name)

    def store_file(self, file_path: str):
        """Moves a file to the dataset's file path.

        Args:
            file_path (str): The file path to move.

        Raises:
            FileNotFoundError: If `file_path` does not exist.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File {file_path} does not exist.")
        shutil.move(file_path, self.file_path)

    def download(self, url: str = None):
        """Downloads a file to the dataset's file path.

        Args:
            url (str, optional): The url to download. Defaults to the
                dataset's `url` attribute.

        Raises:
            ValueError: If no URL is available, the release URL is malformed,
                the release asset cannot be found, or the server does not
                answer with status 200.
        """
        if url is None:
            url = self.url
        if url is None:
            raise ValueError(
                f"No URL provided to download dataset {self.label}."
            )

        token = os.environ.get("POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN")
        auth_headers = (
            {} if token is None else {"Authorization": f"token {token}"}
        )

        # "release://" is a special protocol for downloading from GitHub releases
        # e.g. release://policyengine/policyengine-us/cps-2023/cps_2023.h5
        # release://org/repo/release_tag/file_path
        # Use the GitHub API to get the download URL for the release asset
        if url.startswith("release://"):
            parts = url[len("release://") :].split("/", 3)
            if len(parts) != 4:
                raise ValueError(
                    f"Invalid release url {url}: expected release://org/repo/release_tag/file_path."
                )
            org, repo, release_tag, file_path = parts
            url = f"https://api.github.com/repos/{org}/{repo}/releases/tags/{release_tag}"
            response = requests.get(url, headers=auth_headers)
            if response.status_code != 200:
                raise ValueError(
                    f"Invalid response code {response.status_code} for url {url}."
                )
            assets = response.json()["assets"]
            for asset in assets:
                if asset["name"] == file_path:
                    url = asset["url"]
                    break
            else:
                raise ValueError(
                    f"File {file_path} not found in release {release_tag} of {org}/{repo}."
                )

        response = requests.get(
            url,
            headers={
                # Ask the GitHub API for the raw asset rather than its JSON metadata.
                "Accept": "application/octet-stream",
                **auth_headers,
            },
        )

        if response.status_code != 200:
            raise ValueError(
                f"Invalid response code {response.status_code} for url {url}."
            )

        with open(self.file_path, "wb") as f:
            f.write(response.content)

    def remove(self):
        """Removes the dataset from disk."""
        if self.exists:
            self.file_path.unlink()

    @staticmethod
    def from_file(file_path: str, time_period: str = None):
        """Creates a flat file dataset from a file.

        Args:
            file_path (str): The file path to create the dataset from.
            time_period (str, optional): The time period of the data.
                Defaults to None.

        Returns:
            Dataset: The dataset.
        """
        file_path = Path(file_path)

        # Build a one-off subclass, since datasets are configured through
        # class attributes.
        dataset = type(
            "Dataset",
            (Dataset,),
            {
                "name": file_path.stem,
                "label": file_path.stem,
                "data_format": Dataset.FLAT_FILE,
                "file_path": file_path,
                "time_period": time_period,
            },
        )()

        return dataset

    @staticmethod
    def from_dataframe(dataframe: pd.DataFrame, time_period: str = None):
        """Creates an in-memory flat file dataset from a DataFrame.

        Args:
            dataframe (pd.DataFrame): The data the dataset should serve.
            time_period (str, optional): The time period of the data.
                Defaults to None.

        Returns:
            Dataset: The dataset.
        """

        def load(self, key: str = None, mode: str = "r") -> pd.DataFrame:
            """Returns the wrapped DataFrame; keys are not supported."""
            # Same signature as `Dataset.load`, so keyed access fails with the
            # usual flat file error instead of a TypeError.
            if key is not None:
                raise ValueError(
                    "You tried to load a key from a flat file dataset, but flat file datasets do not support keys."
                )
            return dataframe

        dataset = type(
            "Dataset",
            (Dataset,),
            {
                "name": "dataframe",
                "label": "DataFrame",
                "data_format": Dataset.FLAT_FILE,
                # Placeholder: nothing is read from or written to this path.
                "file_path": "dataframe",
                "time_period": time_period,
                "load": load,
            },
        )()

        return dataset