Source code for cowidev.megafile.steps.jhu

import datetime
import os
from functools import reduce
import pandas as pd


def get_jhu(jhu_dir: str):
    """
    Reads each COVID-19 JHU dataset located in /public/data/jhu/
    Melts the dataframe to vertical format (1 row per country and date)
    Merges all JHU dataframes into one with outer joins

    Returns:
        jhu {dataframe}
    """
    jhu_variables = [
        "total_cases",
        "new_cases",
        "weekly_cases",
        "total_deaths",
        "new_deaths",
        "weekly_deaths",
        "total_cases_per_million",
        "new_cases_per_million",
        "weekly_cases_per_million",
        "total_deaths_per_million",
        "new_deaths_per_million",
        "weekly_deaths_per_million",
    ]
    data_frames = []
    # Process each file and melt it to vertical format
    for jhu_var in jhu_variables:
        tmp = pd.read_csv(os.path.join(jhu_dir, f"{jhu_var}.csv"))
        country_cols = list(tmp.columns)
        country_cols.remove("date")
        # Carry the last observation forward for International totals to avoid discrepancies
        if jhu_var[:5] == "total":
            tmp = tmp.sort_values("date")
            tmp["International"] = tmp["International"].ffill()
        tmp = (
            pd.melt(tmp, id_vars="date", value_vars=country_cols)
            .rename(columns={"value": jhu_var, "variable": "location"})
            .dropna()
        )
        if jhu_var[:7] == "weekly_":
            tmp[jhu_var] = tmp[jhu_var].div(7).round(3)
            tmp = tmp.rename(
                errors="ignore",
                columns={
                    "weekly_cases": "new_cases_smoothed",
                    "weekly_deaths": "new_deaths_smoothed",
                    "weekly_cases_per_million": "new_cases_smoothed_per_million",
                    "weekly_deaths_per_million": "new_deaths_smoothed_per_million",
                },
            )
        else:
            tmp[jhu_var] = tmp[jhu_var].round(3)
        data_frames.append(tmp)
    # Outer join between all files
    jhu = reduce(
        lambda left, right: pd.merge(left, right, on=["date", "location"], how="outer"),
        data_frames,
    )
    return jhu
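
A minimal sketch of the melt step this function applies to each CSV. It assumes the standard wide layout of the exported JHU files (one `date` column plus one column per location); the frame and location names below are invented for illustration, not taken from the repository.

    import pandas as pd

    # Toy wide-format frame mimicking one JHU export such as total_cases.csv:
    # one "date" column plus one column per location (names are illustrative)
    wide = pd.DataFrame(
        {
            "date": ["2020-03-01", "2020-03-02"],
            "France": [100, 130],
            "International": [5, None],
        }
    )

    # The same melt-and-rename chain used in get_jhu: one row per (date, location),
    # with missing values dropped
    long = (
        pd.melt(wide, id_vars="date", value_vars=["France", "International"])
        .rename(columns={"value": "total_cases", "variable": "location"})
        .dropna()
    )
    print(long)
    #          date       location  total_cases
    # 0  2020-03-01         France        100.0
    # 1  2020-03-02         France        130.0
    # 2  2020-03-01  International          5.0

Each per-variable frame produced this way is then outer-joined on `(date, location)`, so a country-date pair present in any input file survives into the merged output.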
def add_cumulative_deaths_last12m(df: pd.DataFrame) -> pd.DataFrame:
    # Daily change in total_deaths, computed per location so that the first row
    # of a country never diffs against the previous country's last value
    df["daily_diff"] = df.groupby("location")["total_deaths"].transform(lambda s: s.fillna(0).diff())
    # Cutoff one average Gregorian year (365.2425 days) before the latest date in the data
    date_cutoff = pd.to_datetime(df.date.max()) - datetime.timedelta(days=365.2425)
    df.loc[pd.to_datetime(df.date) < date_cutoff, "daily_diff"] = 0
    df["total_deaths_last12m"] = df.groupby("location")["daily_diff"].cumsum()
    df.loc[
        (pd.to_datetime(df.date) < date_cutoff) | (df.new_deaths.isnull()), "total_deaths_last12m"
    ] = pd.NA
    df["total_deaths_last12m_per_million"] = df.total_deaths_last12m.mul(1_000_000).div(df.population)
    return df.drop(columns="daily_diff")
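
A hedged usage sketch for this step. The two locations and all values below are invented; in the real pipeline the function receives the merged JHU dataframe with `population` already attached.

    import pandas as pd

    # Invented two-country frame with the columns the function expects:
    # location, date, total_deaths, new_deaths, population
    df = pd.DataFrame(
        {
            "location": ["Aland"] * 3 + ["Borduria"] * 3,  # hypothetical locations
            "date": ["2023-01-01", "2023-01-02", "2023-01-03"] * 2,
            "total_deaths": [10, 12, 15, 100, 100, 104],
            "new_deaths": [1, 2, 3, 0, 0, 4],
            "population": [30_000] * 3 + [2_000_000] * 3,
        }
    )

    out = add_cumulative_deaths_last12m(df)
    # total_deaths_last12m accumulates each location's daily increase in
    # total_deaths over the trailing ~365.24 days; the per-million column
    # scales that total by the location's population
    print(out[["location", "date", "total_deaths_last12m", "total_deaths_last12m_per_million"]])

Note that the first row of each location has no previous value to diff against, so its `total_deaths_last12m` is missing unless it falls before the 12-month cutoff, where `daily_diff` is zeroed.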