import pandas as pd
from cowidev.utils import clean_date, clean_date_series
from cowidev.utils.utils import check_known_columns
from cowidev.utils.web.download import read_csv_from_url
from cowidev.vax.utils.base import CountryVaxBase
from cowidev.vax.utils.utils import build_vaccine_timeline
class Australia(CountryVaxBase):
    """Vaccination data pipeline for Australia, built on CSV files from covidbaseau.com.

    Two datasets are produced by :meth:`export`: the main time series (doses by
    date) and an age-group breakdown (per-hundred coverage by date and age
    group). Methods such as ``pipe_metadata``, ``make_monotonic``,
    ``pipe_age_metadata`` and ``export_datafile`` are not defined in this class;
    they are presumably inherited from ``CountryVaxBase``.
    """

    # CSV endpoints: main dose counts, and first/second-dose coverage by age group.
    source_url = {
        "main": "https://covidbaseau.com/people-vaccinated.csv",
        "age_1d": "https://covidbaseau.com/historical/Vaccinations%20By%20Age%20Group%20and%20State%20First.csv",
        "age_2d": "https://covidbaseau.com/historical/Vaccinations%20By%20Age%20Group%20and%20State%20Second.csv",
    }
    # Human-facing reference URL, used as the source URL in the age metadata.
    source_url_ref = "https://covidbaseau.com/"
    source_file = "https://covidbaseau.com/people-vaccinated.csv"
    location = "Australia"
    # Mapping from the source's dose columns to the output metric names.
    columns_rename = {
        "dose_1": "people_vaccinated",
        "dose_2": "people_fully_vaccinated",
        "dose_3": "total_boosters",
    }
    # Vaccine name -> date string, handed to build_vaccine_timeline.
    vaccine_timeline = {
        "Pfizer/BioNTech": "2021-01-01",
        "Moderna": "2021-03-06",
        "Oxford/AstraZeneca": "2021-03-06",
        "Novavax": "2022-02-17",
    }

    def read(self) -> pd.DataFrame:
        """Download the main CSV and check its columns against the expected set.

        Returns:
            The raw frame with columns ``date``, ``dose_1``, ``dose_2``, ``dose_3``.
        """
        df = read_csv_from_url(self.source_url["main"])
        check_known_columns(df, ["date", "dose_1", "dose_2", "dose_3"])
        return df

    def read_age(self) -> pd.DataFrame:
        """Download the first- and second-dose age CSVs and combine them in long format.

        Each file is read using its second row as header, fully-empty columns
        are dropped, and the remaining columns are melted around ``Date`` so
        that every row is one (Date, age_group) pair.

        Returns:
            A frame with columns ``Date``, ``age_group``,
            ``people_vaccinated_per_hundred`` and
            ``people_fully_vaccinated_per_hundred``.
        """
        first_dose = read_csv_from_url(self.source_url["age_1d"], header=1).dropna(axis=1, how="all")
        first_dose = first_dose.melt("Date", var_name="age_group", value_name="people_vaccinated_per_hundred")
        second_dose = read_csv_from_url(self.source_url["age_2d"], header=1).dropna(axis=1, how="all")
        second_dose = second_dose.melt(
            "Date", var_name="age_group", value_name="people_fully_vaccinated_per_hundred"
        )
        # Left join: every first-dose row is kept even without a second-dose match.
        return first_dose.merge(second_dose, on=["Date", "age_group"], how="left")

    def pipe_total_vaccinations(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``total_vaccinations`` as the sum of the three dose columns.

        NOTE(review): plain ``+`` propagates NaN, so a missing ``dose_3`` yields a
        missing total for that row -- confirm this is the intended behaviour.
        """
        return df.assign(total_vaccinations=df.dose_1 + df.dose_2 + df.dose_3)

    def pipe_rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename the source dose columns according to ``columns_rename``."""
        return df.rename(columns=self.columns_rename)

    def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalise ``date`` by applying ``clean_date`` to every value.

        ``clean_date`` is called with ``fmt="%Y-%m-%d"`` and ``minus_days=1``
        (presumably shifting each date back by one day -- confirm against
        ``cowidev.utils.clean_date``).
        """
        return df.assign(date=df.date.apply(clean_date, fmt="%Y-%m-%d", minus_days=1))

    def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply ``build_vaccine_timeline`` with this country's ``vaccine_timeline``."""
        return df.pipe(build_vaccine_timeline, self.vaccine_timeline)

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the main-series transformation chain and return it sorted by date."""
        return (
            df.pipe(self.pipe_total_vaccinations)
            .pipe(self.pipe_rename_columns)
            .pipe(self.pipe_date)
            .pipe(self.pipe_vaccine)
            .pipe(self.pipe_metadata)
            .pipe(self.make_monotonic)
            .sort_values("date")
        )

    def pipe_age_groups(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract ``age_group_min`` / ``age_group_max`` from the ``age_group`` label.

        The first run of one or two digits becomes the lower bound; an optional
        ``-NN`` suffix becomes the upper bound (missing when absent). Both new
        columns hold the extracted strings, not numbers.

        Returns:
            A new frame; the input frame is left unmodified.
        """
        regex = r"(\d{1,2})+?(?:-(\d{1,2}))?"
        # With two capture groups, extract() yields a frame with columns 0 and 1.
        bounds = df.age_group.str.extract(regex)
        return df.assign(age_group_min=bounds[0], age_group_max=bounds[1])

    def pipe_age_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
        """Convert the two per-hundred metric columns from text to float.

        The leading run of digits/dots in each value is captured and cast to
        float; anything after it is discarded.

        Returns:
            A new frame; the input frame is left unmodified.
        """
        regex = r"([\d\.]+).*"
        metrics = ["people_vaccinated_per_hundred", "people_fully_vaccinated_per_hundred"]
        # Replace the columns outright instead of writing through `.loc[:, col]`:
        # recent pandas sets `.loc` values in place and would keep the original
        # object dtype rather than producing float columns.
        parsed = {metric: df[metric].str.extract(regex, expand=False).astype(float) for metric in metrics}
        return df.assign(**parsed)

    def pipeline_age(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the age-group transformation chain and return the final column set.

        NOTE(review): no step visible in this class creates the ``date`` or
        ``location`` columns used below (``read_age`` yields ``Date``);
        presumably ``pipe_age_metadata`` supplies them -- confirm.

        NOTE(review): duplicates are detected on the two metric columns only, so
        rows from different dates or age groups that share the same pair of
        values collapse to the first occurrence -- confirm this is intended.
        """
        return (
            df.pipe(self.pipe_age_groups)
            .pipe(self.pipe_age_numeric)
            .pipe(self.pipe_age_metadata)
            # .dropna(subset=["people_vaccinated_per_hundred", "people_fully_vaccinated_per_hundred"], how="all")
            .drop_duplicates(subset=["people_vaccinated_per_hundred", "people_fully_vaccinated_per_hundred"])
            .sort_values(["date", "age_group_min"])[
                [
                    "location",
                    "date",
                    "age_group_min",
                    "age_group_max",
                    "people_vaccinated_per_hundred",
                    "people_fully_vaccinated_per_hundred",
                ]
            ]
        )

    def export(self):
        """Build the main and age-group datasets and pass them to ``export_datafile``."""
        # Main
        df = self.read().pipe(self.pipeline)
        # Age
        df_age = self.read_age().pipe(self.pipeline_age)
        self.export_datafile(
            df=df,
            df_age=df_age,
            meta_age={"source_name": "Ministry of Health via covidbaseau.com", "source_url": self.source_url_ref},
        )
def main():
    """Entry point: instantiate the Australia pipeline and run its export."""
    Australia().export()