import os
from glob import glob
import pandas as pd
from cowidev.utils.utils import make_monotonic as _make_monotonic
def get_latest_file(path, extension):
    """Return the most recently changed file in `path` with the given extension.

    Args:
        path: Directory to search (only direct children are considered).
        extension: File extension without the leading dot, e.g. "csv".

    Returns:
        The matching path with the greatest `os.path.getctime` value.

    Raises:
        ValueError: If no file in `path` matches `*.{extension}`.
    """
    files = glob(os.path.join(path, f"*.{extension}"))
    if not files:
        # max() on an empty list only says "max() arg is an empty sequence";
        # keep the same exception type but report which lookup failed.
        raise ValueError(f"No '*.{extension}' files found in '{path}'")
    return max(files, key=os.path.getctime)
def make_monotonic(df: pd.DataFrame, max_removed_rows=10, new_version=False) -> pd.DataFrame:
    """Force the vaccination time series in `df` to be monotonically non-decreasing.

    The algorithm assumes that the most recent values are the correct ones,
    and therefore removes earlier rows whose value is higher than a later one.

    Args:
        df: Frame with a "date" column plus "total_vaccinations",
            "people_vaccinated" and "people_fully_vaccinated" columns.
        max_removed_rows: Maximum number of rows that may be dropped before an
            error is raised. `None` disables the check (legacy path).
        new_version: When True, delegate to the imported `_make_monotonic`
            helper (called with `strict=False`) instead of the legacy loop.

    Returns:
        A date-sorted frame with the offending rows removed (legacy path), or
        whatever `_make_monotonic` returns (new path).

    Raises:
        Exception: If more than `max_removed_rows` rows were removed.
    """
    metrics = ("total_vaccinations", "people_vaccinated", "people_fully_vaccinated")
    if new_version:
        return _make_monotonic(
            df=df,
            column_date="date",
            column_metrics=list(metrics),
            max_removed_rows=max_removed_rows,
            strict=False,
        )

    n_rows_before = len(df)
    dates_before = set(df.date)
    df_before = df.copy()

    df = df.sort_values("date")
    for metric in metrics:
        # `Series.is_monotonic` was removed in pandas 2.0;
        # `is_monotonic_increasing` is the equivalent, long-available spelling.
        while not df[metric].ffill().fillna(0).is_monotonic_increasing:
            filled = df[metric].ffill()
            # Negative diff: this row's value exceeds the next one, so drop it.
            diff = filled.shift(-1) - filled
            df = df[(diff >= 0) | diff.isna()]

    if max_removed_rows is not None:
        num_removed_rows = n_rows_before - len(df)
        if num_removed_rows > max_removed_rows:
            dates_wrong = dates_before.difference(set(df.date))
            df_wrong = df_before[df_before.date.isin(dates_wrong)]
            raise Exception(
                f"{num_removed_rows} rows have been removed. That is more than maximum allowed ({max_removed_rows}) by"
                f" make_monotonic() - check the data. Check \n{df_wrong}"
            )

    return df
def build_vaccine_timeline(df: pd.DataFrame, vaccine_timeline: dict) -> pd.DataFrame:
    """Attach a `vaccine` column listing the vaccines in use on each row's date.

    A vaccine is included for a row when its start date compares less than or
    equal to that row's `date` value. The selected names are de-duplicated,
    sorted and joined with ", ". The input frame is not modified.

    vaccine_timeline: dictionary of "vaccine" -> "start_date"
    Example:
        {
            "Pfizer/BioNTech": "2021-02-24",
            "Sinovac": "2021-03-03",
            "Oxford/AstraZeneca": "2021-05-03",
            "CanSino": "2021-05-09",
            "Sinopharm": "2021-09-18",
        }
    """

    def _vaccines_on(day):
        # Every vaccine whose rollout had started by `day`.
        available = {name for name, start in vaccine_timeline.items() if start <= day}
        return ", ".join(sorted(available))

    return df.assign(vaccine=df.date.apply(_vaccines_on))
def add_latest_who_values(df: pd.DataFrame, who_location_name: str, metrics: list):
    """Insert the latest data available from the WHO vaccination dataset into `df`.

    The WHO CSV is downloaded and filtered to rows whose COUNTRY equals
    `who_location_name` and whose DATA_SOURCE is "REPORTING"; the first such
    row is used. The last row of `df` dated on or before WHO's DATE_UPDATED
    serves as the template for the new row, and any existing row with exactly
    that date is replaced by it.

    Args:
        df: DataFrame (or a single Series row) with a "date" column. The
            caller's object is left unmodified.
        who_location_name: Value to match against the WHO COUNTRY column.
        metrics: list of metrics to be used from the WHO dataset. Other metrics
            that aren't listed are set to pd.NA for this specific row.

    Returns:
        A new DataFrame, sorted by date, including the WHO-derived row.

    Raises:
        AssertionError: If `metrics` is not a list.
        Exception: If the WHO dataset has no REPORTING row for the location.
    """
    assert isinstance(metrics, list), "The `metrics` argument in add_latest_who_values should be a list!"

    if isinstance(df, pd.Series):
        df = df.to_frame().T

    # assign() builds a new frame, so the caller's DataFrame keeps its own
    # "date" column instead of being converted to str in place.
    df = df.assign(date=df.date.astype(str)).sort_values("date")

    who = pd.read_csv(
        "https://covid19.who.int/who-data/vaccination-data.csv",
        usecols=[
            "COUNTRY",
            "DATA_SOURCE",
            "DATE_UPDATED",
            "TOTAL_VACCINATIONS",
            "PERSONS_VACCINATED_1PLUS_DOSE",
            "PERSONS_FULLY_VACCINATED",
        ],
    )

    who = who[(who.COUNTRY == who_location_name) & (who.DATA_SOURCE == "REPORTING")]
    if len(who) == 0:
        raise Exception(f"No row of type REPORTING was found in the WHO dataset for location '{who_location_name}'")

    last_who_report_date = who.DATE_UPDATED.values[0]
    # NOTE(review): if no row of `df` is dated on or before the WHO date, the
    # template is empty and it looks like no WHO row gets added - confirm.
    who_row = df[df.date <= last_who_report_date].tail(1).copy()
    original_rows = df[df.date != last_who_report_date].copy()

    # Output metric -> WHO column it is read from.
    who_columns = {
        "total_vaccinations": "TOTAL_VACCINATIONS",
        "people_vaccinated": "PERSONS_VACCINATED_1PLUS_DOSE",
        "people_fully_vaccinated": "PERSONS_FULLY_VACCINATED",
    }
    who_row["date"] = last_who_report_date
    for metric, who_column in who_columns.items():
        who_row[metric] = who[who_column].values[0] if metric in metrics else pd.NA

    # This function reads no booster column from WHO, so blank out the value
    # copied over from the template row.
    if "total_boosters" in who_row.columns:
        who_row["total_boosters"] = pd.NA
    who_row["source_url"] = "https://covid19.who.int/"

    df = pd.concat([original_rows, who_row], ignore_index=True).sort_values("date")
    return df