import os
import datetime
import re
import numbers
import pandas as pd
import requests
from cowidev import PATHS
from cowidev.vax.utils.utils import make_monotonic
GH_LINK = "https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/country_data"
[docs]def enrich_data(ds: pd.Series, row, value) -> pd.Series:
return ds.append(pd.Series({row: value}))
[docs]def increment(
location,
total_vaccinations,
date,
vaccine,
source_url,
people_vaccinated=None,
people_partly_vaccinated=None,
people_fully_vaccinated=None,
total_boosters=None,
make_series_monotonic=False,
):
# Check fields
_check_fields(
location=location,
vaccine=vaccine,
source_url=source_url,
date=date,
total_vaccinations=total_vaccinations,
people_vaccinated=people_vaccinated,
people_partly_vaccinated=people_partly_vaccinated,
people_fully_vaccinated=people_fully_vaccinated,
total_boosters=total_boosters,
)
_from_gh_to_scripts(location)
filepath_automated = PATHS.out_vax(location)
# Update file in output/
if os.path.isfile(filepath_automated):
df = _increment(
filepath=filepath_automated,
location=location,
total_vaccinations=total_vaccinations,
date=date,
vaccine=vaccine,
source_url=source_url,
people_vaccinated=people_vaccinated,
people_partly_vaccinated=people_partly_vaccinated,
people_fully_vaccinated=people_fully_vaccinated,
total_boosters=total_boosters,
)
# If not available, create new file
else:
df = _build_df(
location=location,
total_vaccinations=total_vaccinations,
date=date,
vaccine=vaccine,
source_url=source_url,
people_vaccinated=people_vaccinated,
people_partly_vaccinated=people_partly_vaccinated,
people_fully_vaccinated=people_fully_vaccinated,
total_boosters=total_boosters,
)
# To Integer type
col_ints = [
"total_vaccinations",
"people_vaccinated",
"people_partly_vaccinated",
"people_fully_vaccinated",
"total_boosters",
]
col_ints_have = [col for col in col_ints if col in df.columns]
for col in col_ints_have:
df[col] = df[col].astype("Int64").fillna(pd.NA)
df = df[["location", "date", "vaccine", "source_url"] + col_ints_have]
# Make series monotonic
if make_series_monotonic:
df = make_monotonic(df)
df.to_csv(PATHS.out_vax(location), index=False)
# print(f"NEW: {total_vaccinations} doses on {date}")
[docs]def _from_gh_to_scripts(location):
filepath_automated = PATHS.out_vax(location)
filepath_public = f"{GH_LINK}/{location}.csv".replace(" ", "%20")
# Move from public to output folder
if not os.path.isfile(filepath_automated) and requests.get(filepath_public).ok:
pd.read_csv(filepath_public).to_csv(filepath_automated, index=False)
[docs]def _check_fields(
location,
source_url,
vaccine,
date,
total_vaccinations,
people_vaccinated,
people_partly_vaccinated,
people_fully_vaccinated,
total_boosters,
):
# Check location, vaccine, source_url
if not isinstance(location, str):
type_wrong = type(location).__name__
raise TypeError(f"Check `location` type! Should be a str, found {type_wrong}. Value was {location}")
if not isinstance(vaccine, str):
type_wrong = type(vaccine).__name__
raise TypeError(f"Check `vaccine` type! Should be a str, found {type_wrong}. Value was {vaccine}")
if not isinstance(source_url, str):
type_wrong = type(source_url).__name__
raise TypeError(f"Check `source_url` type! Should be a str, found {type_wrong}. Value was {source_url}")
# Check metrics
if not isinstance(total_vaccinations, numbers.Number):
type_wrong = type(location).__name__
raise TypeError(
f"Check `total_vaccinations` type! Should be numeric, found {type_wrong}. Value was {total_vaccinations}"
)
if not (isinstance(total_boosters, numbers.Number) or pd.isnull(total_boosters)):
type_wrong = type(total_boosters).__name__
raise TypeError(
f"Check `total_boosters` type! Should be numeric, found {type_wrong}. Value was {total_boosters}"
)
if not (isinstance(people_vaccinated, numbers.Number) or pd.isnull(people_vaccinated)):
type_wrong = type(people_vaccinated).__name__
raise TypeError(
f"Check `people_vaccinated` type! Should be numeric, found {type_wrong}. Value was {people_vaccinated}"
)
if not (isinstance(people_partly_vaccinated, numbers.Number) or pd.isnull(people_partly_vaccinated)):
type_wrong = type(people_partly_vaccinated).__name__
raise TypeError(
f"Check `people_vaccinated` type! Should be numeric, found {type_wrong}. Value was {people_vaccinated}"
)
if not (isinstance(people_fully_vaccinated, numbers.Number) or pd.isnull(people_fully_vaccinated)):
type_wrong = type(people_fully_vaccinated).__name__
raise TypeError(
f"Check `people_fully_vaccinated` type! Should be numeric, found {type_wrong}. Value was "
f"{people_fully_vaccinated}"
)
# Check date
if not isinstance(date, str):
type_wrong = type(date).__name__
raise TypeError(f"Check `date` type! Should be numeric, found {type_wrong}. Value was {date}")
if not (re.match(r"\d{4}-\d{2}-\d{2}", date) and date <= str(datetime.date.today() + datetime.timedelta(days=1))):
raise ValueError(f"Check `date`. It either does not match format YYYY-MM-DD or exceeds todays'date: {date}")
[docs]def _increment(
filepath,
location,
total_vaccinations,
date,
vaccine,
source_url,
people_vaccinated=None,
people_partly_vaccinated=None,
people_fully_vaccinated=None,
total_boosters=None,
):
prev = pd.read_csv(filepath)
if total_vaccinations <= prev["total_vaccinations"].max() or date < prev["date"].max():
df = prev.copy()
elif date == prev["date"].max():
df = prev.copy()
df.loc[df["date"] == date, "total_vaccinations"] = total_vaccinations
df.loc[df["date"] == date, "people_vaccinated"] = people_vaccinated
if people_partly_vaccinated is not None:
df.loc[df["date"] == date, "people_partly_vaccinated"] = people_partly_vaccinated
df.loc[df["date"] == date, "people_fully_vaccinated"] = people_fully_vaccinated
df.loc[df["date"] == date, "total_boosters"] = total_boosters
df.loc[df["date"] == date, "source_url"] = source_url
else:
new = _build_df(
location,
total_vaccinations,
date,
vaccine,
source_url,
people_vaccinated,
people_partly_vaccinated,
people_fully_vaccinated,
total_boosters,
)
df = pd.concat([prev, new])
return df.sort_values("date")
[docs]def _build_df(
location,
total_vaccinations,
date,
vaccine,
source_url,
people_vaccinated=None,
people_partly_vaccinated=None,
people_fully_vaccinated=None,
total_boosters=None,
):
new = pd.DataFrame(
{
"location": location,
"date": date,
"vaccine": vaccine,
"total_vaccinations": [total_vaccinations],
"source_url": source_url,
}
)
if people_vaccinated is not None:
new["people_vaccinated"] = people_vaccinated
if people_partly_vaccinated is not None:
new["people_partly_vaccinated"] = people_partly_vaccinated
if people_fully_vaccinated is not None:
new["people_fully_vaccinated"] = people_fully_vaccinated
if total_boosters is not None:
new["total_boosters"] = total_boosters
return new
[docs]def merge_with_current_data(df: pd.DataFrame, filepath: str) -> pd.DataFrame:
col_ints = [
"total_vaccinations",
"people_vaccinated",
"people_partly_vaccinated",
"people_fully_vaccinated",
"total_boosters",
]
# Load current data
if os.path.isfile(filepath):
df_current = pd.read_csv(filepath)
# Merge
df_current = df_current[~df_current.date.isin(df.date)]
df = pd.concat([df, df_current]).sort_values(by="date")
# Int values
col_ints = list(df.columns.intersection(col_ints))
if col_ints:
df[col_ints] = df[col_ints].astype("Int64").fillna(pd.NA)
columns = ["location", "date", "vaccine", "source_url"] + col_ints
return df[columns]