import re
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup

from cowidev.utils import clean_date, clean_date_series, get_soup
from cowidev.utils.web.download import read_xlsx_from_url
from cowidev.vax.utils.base import CountryVaxBase
from cowidev.vax.utils.utils import build_vaccine_timeline
class NewZealand(CountryVaxBase):
    """Vaccination data pipeline for New Zealand.

    Locates the spreadsheet linked from the Ministry of Health page
    (``source_url_ref``), reads its ``Date`` sheet, and derives the cumulative
    vaccination metrics that are then exported.

    NOTE(review): ``pipe_rename_columns``, ``pipe_metadata``, ``make_monotonic``
    and ``export_datafile`` are not defined here; they are expected to be
    provided by ``CountryVaxBase``.
    """

    # Consider: https://github.com/minhealthnz/nz-covid-data/tree/main/vaccine-data
    source_url_ref = "https://www.health.govt.nz/our-work/diseases-and-conditions/covid-19-novel-coronavirus/covid-19-data-and-statistics/covid-19-vaccine-data"
    # Host used to resolve the download link found on the page.
    base_url = "https://www.health.govt.nz"
    location = "New Zealand"
    # Source column name -> internal metric name.
    rename_columns = {
        "First doses": "people_vaccinated",
        "Second doses": "people_fully_vaccinated",
        "Third primary doses": "third_dose",
        "First Boosters": "total_boosters",
        "Second Boosters": "total_boosters_2",
        "Date": "date",
    }
    # Vaccine name -> start date, handed to `build_vaccine_timeline`.
    vaccines_start_date = {
        "Pfizer/BioNTech": "2021-01-01",
        "Oxford/AstraZeneca": "2021-11-26",
        "Novavax": "2022-03-14",
    }
    # Columns that `pipe_cumsum` accumulates
    # (presumably per-date counts in the source sheet -- TODO confirm).
    columns_cumsum = [
        "people_vaccinated",
        "people_fully_vaccinated",
        "third_dose",
        "total_boosters",
        "total_boosters_2",
    ]

    def read(self) -> pd.DataFrame:
        """Read the ``Date`` sheet of the spreadsheet linked from the source page.

        Returns:
            The raw sheet as a DataFrame, with the source's own column names.
        """
        soup = get_soup(self.source_url_ref)
        # Scraping of the on-page "latest" tables is currently disabled.
        # self._read_latest(soup)
        link = self._parse_file_link(soup)
        return read_xlsx_from_url(link, sheet_name="Date")

    def _read_latest(self, soup: BeautifulSoup) -> None:
        """Build ``self.latest`` from the first two HTML tables on the page.

        The result is a one-row DataFrame with ``people_vaccinated``,
        ``people_fully_vaccinated``, ``total_boosters`` and ``date``.

        Raises:
            ValueError: If the "as at" date sentence is not found in the page.
        """
        # Wrap in StringIO: passing literal HTML to `read_html` is deprecated
        # in recent pandas releases.
        tables = pd.read_html(StringIO(str(soup)))
        latest = tables[0].set_index("Unnamed: 0")
        latest_kids = tables[1].set_index("Unnamed: 0")
        match = re.search(r"Data in this section is as at 11:59pm ([\d]+ [A-Za-z]+ 20\d{2})", soup.text)
        if match is None:
            raise ValueError("Could not find the 'as at' date of the latest data in the source page.")
        latest_date = match.group(1)
        self.latest = pd.DataFrame(
            {
                # First and second doses add the second table's figures.
                "people_vaccinated": latest.loc["First dose", "Cumulative total"]
                + latest_kids.loc["First dose", "Cumulative total"],
                "people_fully_vaccinated": latest.loc["Second dose", "Cumulative total"]
                + latest_kids.loc["Second dose", "Cumulative total"],
                # Boosters include third primary doses.
                "total_boosters": latest.loc["Boosters", "Cumulative total"]
                + latest.loc["Third primary", "Cumulative total"],
                "date": [clean_date(latest_date, "%d %B %Y")],
            }
        )

    def _parse_file_link(self, soup: BeautifulSoup) -> str:
        """Return the URL of the first link following the ``download`` element.

        Raises:
            ValueError: If the ``download`` element or a following link with an
                ``href`` cannot be found.
        """
        anchor = soup.find(id="download")
        link_tag = anchor.find_next("a") if anchor is not None else None
        href = link_tag.get("href") if link_tag is not None else None
        if not href:
            raise ValueError("Could not find the data file link in the source page.")
        # urljoin resolves relative hrefs against the host and leaves
        # absolute URLs untouched.
        return urljoin(self.base_url, href)

    def pipe_cumsum(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``columns_cumsum`` replaced by their cumulative sums."""
        # Work on a copy so the caller's frame is not modified in place.
        df = df.copy()
        df[self.columns_cumsum] = df[self.columns_cumsum].cumsum()
        return df

    def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
        """Format the ``date`` column via ``clean_date_series`` (input format ``%d/%m/%Y``)."""
        return df.assign(date=clean_date_series(df.date, "%d/%m/%Y"))

    def pipe_boosters(self, df: pd.DataFrame) -> pd.DataFrame:
        """Set ``total_boosters`` to first boosters + third primary doses + second boosters."""
        return df.assign(total_boosters=df.total_boosters + df.third_dose + df.total_boosters_2)

    def pipe_latest_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
        """Append ``self.latest`` to ``df``, keeping the last row for each date.

        Requires ``self.latest``, which is only set by ``_read_latest``.
        """
        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
        # equivalent.
        combined = pd.concat([df.sort_values("date"), self.latest], ignore_index=True)
        return combined.drop_duplicates("date", keep="last")

    def pipe_total_vaccinations(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``total_vaccinations`` as first doses + second doses + total boosters."""
        return df.assign(total_vaccinations=df.people_vaccinated + df.people_fully_vaccinated + df.total_boosters)

    def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply ``build_vaccine_timeline`` using ``vaccines_start_date``."""
        return build_vaccine_timeline(df, self.vaccines_start_date)

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run all transformation steps, in order, on the raw data."""
        return (
            df.pipe(self.pipe_rename_columns)
            .pipe(self.pipe_cumsum)
            .pipe(self.pipe_date)
            .pipe(self.pipe_boosters)
            # Currently disabled; needs `self.latest` from `_read_latest`.
            # .pipe(self.pipe_latest_metrics)
            .pipe(self.pipe_total_vaccinations)
            .pipe(self.pipe_vaccine)
            .pipe(self.pipe_metadata)
            .pipe(self.make_monotonic)
        )

    def export(self):
        """Read the source, run the pipeline and export the result via ``export_datafile``."""
        df = self.read().pipe(self.pipeline)
        self.export_datafile(df, valid_cols_only=True)
def main():
    """Entry point: instantiate the New Zealand pipeline and run its export."""
    exporter = NewZealand()
    exporter.export()