Source code for cowidev.vax.batch.new_zealand

import re

import pandas as pd
from bs4 import BeautifulSoup

from cowidev.utils import clean_date, clean_date_series, get_soup
from cowidev.utils.web.download import read_xlsx_from_url
from cowidev.vax.utils.base import CountryVaxBase
from cowidev.vax.utils.utils import build_vaccine_timeline


[docs]class NewZealand(CountryVaxBase): # Consider: https://github.com/minhealthnz/nz-covid-data/tree/main/vaccine-data source_url_ref = "https://www.health.govt.nz/our-work/diseases-and-conditions/covid-19-novel-coronavirus/covid-19-data-and-statistics/covid-19-vaccine-data" base_url = "https://www.health.govt.nz" location = "New Zealand" rename_columns = { "First doses": "people_vaccinated", "Second doses": "people_fully_vaccinated", "Third primary doses": "third_dose", "First Boosters": "total_boosters", "Second Boosters": "total_boosters_2", "Date": "date", } vaccines_start_date = { "Pfizer/BioNTech": "2021-01-01", "Oxford/AstraZeneca": "2021-11-26", "Novavax": "2022-03-14", } columns_cumsum = [ "people_vaccinated", "people_fully_vaccinated", "third_dose", "total_boosters", "total_boosters_2", ]
[docs] def read(self) -> pd.DataFrame: """Reads the data from the source.""" soup = get_soup(self.source_url_ref) # self._read_latest(soup) link = self._parse_file_link(soup) df = read_xlsx_from_url(link, sheet_name="Date") return df
[docs] def _read_latest(self, soup): """Reads the latest data from the soup.""" tables = pd.read_html(str(soup)) latest = tables[0].set_index("Unnamed: 0") latest_kids = tables[1].set_index("Unnamed: 0") latest_date = re.search(r"Data in this section is as at 11:59pm ([\d]+ [A-Za-z]+ 20\d{2})", soup.text).group(1) self.latest = pd.DataFrame( { "people_vaccinated": latest.loc["First dose", "Cumulative total"] + latest_kids.loc["First dose", "Cumulative total"], "people_fully_vaccinated": latest.loc["Second dose", "Cumulative total"] + latest_kids.loc["Second dose", "Cumulative total"], "total_boosters": latest.loc["Boosters", "Cumulative total"] + latest.loc["Third primary", "Cumulative total"], "date": [clean_date(latest_date, "%d %B %Y")], } )
[docs] def pipe_cumsum(self, df: pd.DataFrame) -> pd.DataFrame: """Calculates cumulative sum of the columns.""" df[self.columns_cumsum] = df[self.columns_cumsum].cumsum() return df
[docs] def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame: """Formats the date column.""" return df.assign(date=clean_date_series(df.date, "%d/%m/%Y"))
[docs] def pipe_boosters(self, df: pd.DataFrame) -> pd.DataFrame: """Calculates the total boosters.""" return df.assign(total_boosters=df.total_boosters + df.third_dose + df.total_boosters_2)
[docs] def pipe_latest_metrics(self, df: pd.DataFrame) -> pd.DataFrame: """pipes the latest metrics.""" return df.sort_values("date").append(self.latest, ignore_index=True).drop_duplicates("date", keep="last")
[docs] def pipe_total_vaccinations(self, df: pd.DataFrame) -> pd.DataFrame: """Calculates the total vaccinations.""" return df.assign(total_vaccinations=df.people_vaccinated + df.people_fully_vaccinated + df.total_boosters)
[docs] def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame: """Builds the vaccine timeline.""" return build_vaccine_timeline(df, self.vaccines_start_date)
[docs] def pipeline(self, df: pd.DataFrame) -> pd.DataFrame: """Pipeline for the data""" return ( df.pipe(self.pipe_rename_columns) .pipe(self.pipe_cumsum) .pipe(self.pipe_date) .pipe(self.pipe_boosters) # .pipe(self.pipe_latest_metrics) .pipe(self.pipe_total_vaccinations) .pipe(self.pipe_vaccine) .pipe(self.pipe_metadata) .pipe(self.make_monotonic) )
[docs] def export(self): """Exports the data to CSV""" df = self.read().pipe(self.pipeline) self.export_datafile(df, valid_cols_only=True)
[docs]def main(): NewZealand().export()