{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# National dataset validation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "
\n", "
\n", "This is the init_notebook_mode cell from ITables v2.2.1
\n", "(you should not see this message - is your notebook trusted?)\n", "
\n", "
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from policyengine_uk_data import EnhancedFRS_2022_23, FRS_2022_23, SPI_2020_21\n", "from policyengine_uk_data.utils.loss import get_loss_results\n", "import pandas as pd\n", "from itables import init_notebook_mode\n", "import itables.options as opt\n", "opt.maxBytes = \"1MB\"\n", "\n", "init_notebook_mode(all_interactive=True)\n", "\n", "def get_validation():\n", " df = pd.DataFrame()\n", " for dataset in [FRS_2022_23, EnhancedFRS_2022_23]:\n", " for year in range(2022, 2029):\n", " loss_results = get_loss_results(dataset, year)\n", " loss_results[\"time_period\"] = year\n", " loss_results[\"dataset\"] = dataset.label\n", " df = pd.concat([df, loss_results])\n", " df = df.reset_index(drop=True)\n", " return df\n", "\n", "df = get_validation()\n", "truth_df = df[df.dataset == df.dataset.unique()[0]].reset_index()\n", "truth_df[\"estimate\"] = truth_df[\"target\"]\n", "truth_df[\"error\"] = truth_df[\"estimate\"] - truth_df[\"target\"]\n", "truth_df[\"abs_error\"] = truth_df[\"error\"].abs()\n", "truth_df[\"rel_error\"] = truth_df[\"error\"] / truth_df[\"target\"]\n", "truth_df[\"abs_rel_error\"] = truth_df[\"rel_error\"].abs()\n", "truth_df[\"dataset\"] = \"Official\"\n", "df = pd.concat([df, truth_df]).reset_index(drop=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calibration check: the table below shows how both the original and enhanced FRS datasets compare to over 2,000 official statistics (which the EFRS was explicitly calibrated to hit) from the OBR, DWP and HMRC.\n", "\n", "Since the EFRS is calibrated to these statistics, high performance is expected and achieved.\n", "\n", "## Full results" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "
nameestimatetargeterrorabs_errorrel_errorabs_rel_errortime_perioddataset
\n", "\n", "
\n", "Loading ITables v2.2.1 from the init_notebook_mode cell...\n", "(need help?)
\n", "\n" ], "text/plain": [ " name estimate \\\n", "0 obr/attendance_allowance 3.620932e+09 \n", "1 obr/carers_allowance 2.809091e+09 \n", "2 obr/dla 3.889957e+09 \n", "3 obr/esa 7.209292e+09 \n", "4 obr/esa_contrib 1.777680e+09 \n", "... ... ... \n", "7009 hmrc/property_income_count_income_band_97_12_5... 2.362992e+06 \n", "7010 hmrc/savings_interest_income_income_band_97_12... 4.164107e+09 \n", "7011 hmrc/savings_interest_income_count_income_band... 1.176774e+07 \n", "7012 hmrc/dividend_income_income_band_97_12_570.0_t... 1.200313e+11 \n", "7013 hmrc/dividend_income_count_income_band_97_12_5... 4.006604e+06 \n", "\n", " target error abs_error rel_error abs_rel_error \\\n", "0 5.700000e+09 -2.079068e+09 2.079068e+09 -0.364749 0.364749 \n", "1 3.300000e+09 -4.909088e+08 4.909088e+08 -0.148760 0.148760 \n", "2 6.000000e+09 -2.110043e+09 2.110043e+09 -0.351674 0.351674 \n", "3 1.210000e+10 -4.890708e+09 4.890708e+09 -0.404191 0.404191 \n", "4 4.500000e+09 -2.722320e+09 2.722320e+09 -0.604960 0.604960 \n", "... ... ... ... ... ... \n", "7009 2.362992e+06 0.000000e+00 0.000000e+00 0.000000 0.000000 \n", "7010 4.164107e+09 0.000000e+00 0.000000e+00 0.000000 0.000000 \n", "7011 1.176774e+07 0.000000e+00 0.000000e+00 0.000000 0.000000 \n", "7012 1.200313e+11 0.000000e+00 0.000000e+00 0.000000 0.000000 \n", "7013 4.006604e+06 0.000000e+00 0.000000e+00 0.000000 0.000000 \n", "\n", " time_period dataset \n", "0 2022 FRS (2022-23) \n", "1 2022 FRS (2022-23) \n", "2 2022 FRS (2022-23) \n", "3 2022 FRS (2022-23) \n", "4 2022 FRS (2022-23) \n", "... ... ... \n", "7009 2028 Official \n", "7010 2028 Official \n", "7011 2028 Official \n", "7012 2028 Official \n", "7013 2028 Official \n", "\n", "[7014 rows x 9 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop(columns=[\"index\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Comparisons" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "
nametime_periodtarget_frsestimate_frsestimate_efrserror_frserror_efrsabs_error_frsabs_error_efrsrel_error_frsrel_error_efrsabs_rel_error_frsabs_rel_error_efrsrel_error_change_under_efrs
\n", "\n", "
\n", "Loading ITables v2.2.1 from the init_notebook_mode cell...\n", "(need help?)
\n", "\n" ], "text/plain": [ " name time_period \\\n", "0 obr/attendance_allowance 2022 \n", "1 obr/carers_allowance 2022 \n", "2 obr/dla 2022 \n", "3 obr/esa 2022 \n", "4 obr/esa_contrib 2022 \n", "... ... ... \n", "2333 hmrc/property_income_count_income_band_97_12_5... 2028 \n", "2334 hmrc/savings_interest_income_income_band_97_12... 2028 \n", "2335 hmrc/savings_interest_income_count_income_band... 2028 \n", "2336 hmrc/dividend_income_income_band_97_12_570.0_t... 2028 \n", "2337 hmrc/dividend_income_count_income_band_97_12_5... 2028 \n", "\n", " target_frs estimate_frs estimate_efrs error_frs error_efrs \\\n", "0 5.700000e+09 3.620932e+09 6.027340e+09 -2.079068e+09 3.273395e+08 \n", "1 3.300000e+09 2.809091e+09 3.823339e+09 -4.909088e+08 5.233393e+08 \n", "2 6.000000e+09 3.889957e+09 6.276854e+09 -2.110043e+09 2.768544e+08 \n", "3 1.210000e+10 7.209292e+09 1.287792e+10 -4.890708e+09 7.779178e+08 \n", "4 4.500000e+09 1.777680e+09 4.733142e+09 -2.722320e+09 2.331423e+08 \n", "... ... ... ... ... ... \n", "2333 2.362992e+06 1.786734e+06 2.495508e+06 -5.762585e+05 1.325162e+05 \n", "2334 4.164107e+09 6.704610e+09 4.459873e+09 2.540503e+09 2.957657e+08 \n", "2335 1.176774e+07 1.615525e+07 1.260677e+07 4.387506e+06 8.390220e+05 \n", "2336 1.200313e+11 1.085593e+10 1.250550e+11 -1.091754e+11 5.023635e+09 \n", "2337 4.006604e+06 2.786889e+06 4.229811e+06 -1.219715e+06 2.232069e+05 \n", "\n", " abs_error_frs abs_error_efrs rel_error_frs rel_error_efrs \\\n", "0 2.079068e+09 3.273395e+08 -0.364749 0.057428 \n", "1 4.909088e+08 5.233393e+08 -0.148760 0.158588 \n", "2 2.110043e+09 2.768544e+08 -0.351674 0.046142 \n", "3 4.890708e+09 7.779178e+08 -0.404191 0.064291 \n", "4 2.722320e+09 2.331423e+08 -0.604960 0.051809 \n", "... ... ... ... ... \n", "2333 5.762585e+05 1.325162e+05 -0.243868 0.056080 \n", "2334 2.540503e+09 2.957657e+08 0.610096 0.071027 \n", "2335 4.387506e+06 8.390220e+05 0.372842 0.071298 \n", "2336 1.091754e+11 5.023635e+09 -0.909558 0.041853 \n", "2337 1.219715e+06 2.232069e+05 -0.304426 0.055710 \n", "\n", " abs_rel_error_frs abs_rel_error_efrs rel_error_change_under_efrs \n", "0 0.364749 0.057428 -0.307321 \n", "1 0.148760 0.158588 0.009827 \n", "2 0.351674 0.046142 -0.305531 \n", "3 0.404191 0.064291 -0.339900 \n", "4 0.604960 0.051809 -0.553151 \n", "... ... ... ... \n", "2333 0.243868 0.056080 -0.187788 \n", "2334 0.610096 0.071027 -0.539068 \n", "2335 0.372842 0.071298 -0.301543 \n", "2336 0.909558 0.041853 -0.867705 \n", "2337 0.304426 0.055710 -0.248716 \n", "\n", "[2338 rows x 14 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged = pd.merge(\n", " df[df.dataset == \"FRS (2022-23)\"],\n", " df[df.dataset == \"Enhanced FRS (2022-23)\"],\n", " on=[\"time_period\", \"name\"],\n", " suffixes=(\"_frs\", \"_efrs\"),\n", ")\n", "merged[\"rel_error_change_under_efrs\"] = merged[\"abs_rel_error_efrs\"] - merged[\"abs_rel_error_frs\"]\n", "# Sort columns\n", "merged = merged[\n", " [\n", " \"name\",\n", " \"time_period\",\n", " \"target_frs\",\n", " \"estimate_frs\",\n", " \"estimate_efrs\",\n", " \"error_frs\",\n", " \"error_efrs\",\n", " \"abs_error_frs\",\n", " \"abs_error_efrs\",\n", " \"rel_error_frs\",\n", " \"rel_error_efrs\",\n", " \"abs_rel_error_frs\",\n", " \"abs_rel_error_efrs\",\n", " \"rel_error_change_under_efrs\",\n", " ]\n", "]\n", "merged" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }