import datetime
import os
from functools import reduce
import pandas as pd
def get_jhu(jhu_dir: str):
    """
    Reads each COVID-19 JHU dataset located in `jhu_dir`.

    Melts each dataframe to vertical format (1 row per location and date),
    then merges all JHU dataframes into one with outer joins.

    Args:
        jhu_dir: directory containing one `{variable}.csv` file per JHU
            variable, each with a `date` column plus one column per location.

    Returns:
        jhu {dataframe}: long-format dataframe with `date`, `location`, and
        one column per JHU variable (weekly variables are renamed to their
        `*_smoothed` daily-average equivalents).
    """
    jhu_variables = [
        "total_cases",
        "new_cases",
        "weekly_cases",
        "total_deaths",
        "new_deaths",
        "weekly_deaths",
        "total_cases_per_million",
        "new_cases_per_million",
        "weekly_cases_per_million",
        "total_deaths_per_million",
        "new_deaths_per_million",
        "weekly_deaths_per_million",
    ]
    data_frames = []
    # Process each file and melt it to vertical format
    for jhu_var in jhu_variables:
        tmp = pd.read_csv(os.path.join(jhu_dir, f"{jhu_var}.csv"))
        country_cols = list(tmp.columns)
        country_cols.remove("date")
        # Carry the last observation forward for International totals to avoid
        # discrepancies. Guarded: not every extract includes that column, and an
        # unguarded access would raise KeyError.
        if jhu_var.startswith("total"):
            tmp = tmp.sort_values("date")
            if "International" in tmp.columns:
                tmp["International"] = tmp["International"].ffill()
        tmp = (
            pd.melt(tmp, id_vars="date", value_vars=country_cols)
            .rename(columns={"value": jhu_var, "variable": "location"})
            .dropna()
        )
        if jhu_var.startswith("weekly_"):
            # Weekly totals are converted to daily averages ("smoothed" series).
            tmp[jhu_var] = tmp[jhu_var].div(7).round(3)
            tmp = tmp.rename(
                errors="ignore",
                columns={
                    "weekly_cases": "new_cases_smoothed",
                    "weekly_deaths": "new_deaths_smoothed",
                    "weekly_cases_per_million": "new_cases_smoothed_per_million",
                    "weekly_deaths_per_million": "new_deaths_smoothed_per_million",
                },
            )
        else:
            tmp[jhu_var] = tmp[jhu_var].round(3)
        data_frames.append(tmp)
    # Outer join between all files
    jhu = reduce(
        lambda left, right: pd.merge(left, right, on=["date", "location"], how="outer"),
        data_frames,
    )
    return jhu
def add_cumulative_deaths_last12m(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds cumulative deaths over the last 12 months, and its per-million
    variant, to the dataframe.

    Expects `df` to contain `date`, `location`, `total_deaths`, `new_deaths`
    and `population` columns, with rows ordered by date within each location
    (TODO confirm against caller — the diff below assumes that ordering).

    Args:
        df: input dataframe; new columns are added in place.

    Returns:
        pd.DataFrame: `df` with `total_deaths_last12m` and
        `total_deaths_last12m_per_million` columns added (the temporary
        `daily_diff` column is dropped).
    """
    dates = pd.to_datetime(df.date)  # parse once; reused three times below
    # Average Gregorian year length, so the window is exactly 12 months.
    date_cutoff = dates.max() - datetime.timedelta(days=365.2425)
    # Daily increments of total_deaths, computed WITHIN each location so the
    # first row of one location is never diffed against the last row of the
    # previous one (the old GroupBy.fillna().diff() ran diff ungrouped).
    df["daily_diff"] = df["total_deaths"].fillna(0).groupby(df["location"]).diff()
    # Zero out increments before the 12-month window so they don't accumulate.
    df.loc[dates < date_cutoff, "daily_diff"] = 0
    df["total_deaths_last12m"] = df[["location", "daily_diff"]].groupby("location").cumsum()
    # Mask rows outside the window or with no reported new deaths.
    df.loc[(dates < date_cutoff) | (df.new_deaths.isnull()), "total_deaths_last12m"] = pd.NA
    df["total_deaths_last12m_per_million"] = df.total_deaths_last12m.mul(1000000).div(df.population)
    return df.drop(columns="daily_diff")