Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add calibration dashboard module #834

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
- bump: minor
changes:
added:
- Add test cases for housing benefit
changes:
fixed:
- Bugs in carbon tax modules for 2026-27 causing NANs.
added:
- Calibration dashboard.
539 changes: 15 additions & 524 deletions policyengine_uk/data/datasets/frs/calibration/calibrate.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def generate(self):
adjusted_weights = calibrate(
self.input_dataset.name,
time_period=year,
training_log_path="calibration_log_cps.csv.gz",
training_log_path=STORAGE_FOLDER / "calibration_log.csv.gz",
overwrite_existing_log=year == str(self.time_period),
)
for variable in input_dataset.variables:
Expand Down
97 changes: 97 additions & 0 deletions policyengine_uk/data/datasets/frs/calibration/dashboard/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import pandas as pd
import plotly.express as px
from policyengine_core.charts import *
import streamlit as st
import numpy as np
from policyengine_uk.data.storage import STORAGE_FOLDER

# Wide page layout is currently switched off:
# st.set_page_config(layout="wide")

st.title("PolicyEngine UK microdata dashboard")

# Read the gzipped loss table from the package storage folder.
df = pd.read_csv(STORAGE_FOLDER / "dataset_losses.csv.gz")

# Swap internal dataset identifiers for the labels shown in the UI.
df["dataset"] = df["dataset"].replace(
    {
        "frs_2021": "FRS (2021)",
        "calibrated_spi_enhanced_pooled_frs_2019_21": "Enhanced FRS",
    }
)

# The raw table is displayed before the derived error columns are added.
st.dataframe(df, use_container_width=True)

# Signed relative deviation of each value from its target, and its magnitude.
df["rel_error"] = df["value"] / df["target"] - 1
df["abs_rel_error"] = df["rel_error"].abs()

# Two side-by-side selectors: the metric on the left, the period on the right.
left, right = st.columns(2)
metric = left.selectbox("Metric", df["name"].unique())
time_period = right.selectbox("Time period", df["time_period"].unique())


def capitalise(string):
    """Return ``string`` with its first character upper-cased.

    Unlike ``str.capitalize``, the remaining characters are left exactly as
    they are. Slicing (rather than indexing) is used so that an empty string
    is returned unchanged instead of raising ``IndexError``.
    """
    return string[:1].upper() + string[1:]

# Signed percentage label drawn on each bar (relative error against target).
df["Text"] = df.rel_error.apply(lambda x: f"{x:+.0%}")

# Rows for the chosen metric and period; filtered once and reused for both
# the bars and the reference line.
selected = df[(df.name == metric) & (df.time_period == time_period)]

fig = px.bar(
    selected,
    x="dataset",
    y="value",
    color="dataset",
    title=f"{capitalise(metric)} in {time_period}",
    color_discrete_sequence=[MEDIUM_DARK_GRAY, BLUE],
    text="Text",
).update_layout(
    # The bars plot the metric's value (the relative error is only the bar
    # label), so the axis is titled accordingly.
    yaxis_title="Value",
    xaxis_title="Dataset",
    showlegend=False,
)

# Add dashed line for truth: the mean target across the plotted rows.
true_value = selected.target.mean()
fig.add_shape(
    type="line",
    # Span the plotted bars only; this assumes plotly places the categories
    # at integer positions 0..n-1, so +/-0.5 reaches the outer bar edges.
    x0=-0.5,
    y0=true_value,
    x1=selected.dataset.nunique() - 0.5,
    y1=true_value,
    line=dict(
        color=MEDIUM_DARK_GRAY,
        width=3,
        dash="dash",
    ),
)

st.plotly_chart(fig, use_container_width=True)

# Deciles (10%..90%) of the absolute relative error for each dataset in the
# selected time period.

# Filter by period once, then split by dataset with a mask built on the
# already-filtered frame. Chaining df[mask_a][mask_b] with both masks built
# on the full frame relies on implicit index realignment and makes pandas
# emit a "Boolean Series key will be reindexed" warning.
period_df = df[df.time_period == time_period]
errors_by_dataset = {
    dataset: period_df.loc[period_df.dataset == dataset, "abs_rel_error"]
    for dataset in df.dataset.unique()
}

quantiles = []
datasets = []
values = []

for quantile in np.linspace(0.1, 0.9, 9):
    for dataset, errors in errors_by_dataset.items():
        datasets.append(dataset)
        quantiles.append(quantile)
        values.append(errors.quantile(quantile))

quantile_df = pd.DataFrame(
    {"Dataset": datasets, "Quantile": quantiles, "Value": values}
)
# Whole-percent label shown on each bar.
quantile_df["Text"] = quantile_df.Value.apply(lambda x: f"{x:.0%}")

fig = px.bar(
    quantile_df,
    y="Value",
    x="Quantile",
    text="Text",
    color="Dataset",
    title="Quantiles of absolute relative error, by dataset",
    labels={"Dataset": "Dataset", "Quantile": "Quantile"},
    barmode="group",
    color_discrete_sequence=[MEDIUM_DARK_GRAY, BLUE],
)

st.plotly_chart(fig, use_container_width=True)
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from policyengine_uk.data.datasets.frs.calibration.loss import get_snapshot
from policyengine_uk.data.storage import STORAGE_FOLDER
import pandas as pd


def main():
    """Build the combined loss table for every dataset/year pair.

    For each year and dataset, fetches a snapshot via ``get_snapshot``, tags
    it with ``time_period`` and ``dataset`` columns, then concatenates all
    snapshots and writes them to ``dataset_losses.csv.gz`` in the storage
    folder as a gzip-compressed CSV without the index.
    """
    years = ("2023", "2024", "2025", "2026", "2027")
    dataset_names = ("frs_2021", "calibrated_spi_enhanced_pooled_frs_2019_21")

    snapshots = []
    for year in years:
        for dataset_name in dataset_names:
            # Progress output: one line per dataset/year pair.
            print(dataset_name, year)
            snapshot = get_snapshot(dataset_name, year)
            # Tag the returned frame in place with its origin.
            snapshot["time_period"] = year
            snapshot["dataset"] = dataset_name
            snapshots.append(snapshot)

    combined = pd.concat(snapshots, ignore_index=True)

    output_path = STORAGE_FOLDER / "dataset_losses.csv.gz"
    combined.to_csv(output_path, index=False, compression="gzip")


if __name__ == "__main__":
    main()
Loading
Loading