Source code for cowidev.vax.batch.denmark

import requests
import zipfile
import io
import tempfile
import os

import pandas as pd

from cowidev.utils import get_soup
from cowidev.utils.clean.dates import clean_date, localdatenow

from cowidev.vax.utils.base import CountryVaxBase
from cowidev.vax.utils.checks import VACCINES_ONE_DOSE


class Denmark(CountryVaxBase):
    """Vaccination data pipeline for Denmark.

    Downloads the zip archive linked from ``source_url_ref``, derives cumulative
    dose counts from ``Vaccine_dato.csv`` and single-shot vaccine counts from
    ``Vaccine_type_region.csv``, and writes the result to ``self.output_path``
    (see :meth:`export`).
    """

    location = "Denmark"
    source_url_ref = "https://covid19.ssi.dk/overvagningsdata/download-fil-med-vaccinationsdata"
    # Vaccine names as they appear in the source file -> names used in the output.
    vaccines_mapping = {
        "AstraZeneca Covid-19 vaccine": "Oxford/AstraZeneca",
        "Janssen COVID-19 vaccine": "Johnson&Johnson",
        "Moderna Covid-19 Vaccine": "Moderna",
        "Moderna/Spikevax Covid-19 Vacc.": "Moderna",
        "Moderna/Spikevax Covid-19 0,5 ml": "Moderna",
        "Pfizer BioNTech Covid-19 vacc": "Pfizer/BioNTech",
        "Pfizer/Comirnaty Original/Omikron BA1": "Pfizer/BioNTech",
    }
    # Region names accepted in ``Vaccine_type_region.csv``; any other value raises.
    regions_accepted = {
        "Nordjylland",
        "Midtjylland",
        "Syddanmark",
        "Hovedstaden",
        "Sjælland",
    }
    # Date (YYYY-MM-DD) from which Johnson&Johnson is listed in ``pipe_vaccine``;
    # its ddmmyyyy form is also the default search key for back-fill links.
    date_limit_one_dose = "2021-05-27"

    @property
    def date_limit_one_dose_ddmmyyyy(self):
        """``date_limit_one_dose`` reformatted from ``%Y-%m-%d`` to ``%d%m%Y``."""
        return clean_date(self.date_limit_one_dose, "%Y-%m-%d", output_fmt="%d%m%Y")

    def read(self, gap_days, bfill=True) -> pd.DataFrame:
        """Download the latest archive and load it into a dataframe.

        Args:
            gap_days: Forwarded as ``index`` to :meth:`_read_single_shots_bfill`
                (number of older file links to consider). Only used if ``bfill``.
            bfill: If True, fill missing single-shot values from older archives.

        Returns:
            Dataframe with one row per date. When ``bfill`` is True the merge
            helper columns (``single_shots_x``, ``single_shots_y``, ...) are
            kept next to the filled ``single_shots`` / ``single_shots_2nd``.
        """
        url = self._parse_link_zip()
        with tempfile.TemporaryDirectory() as tf:
            # Download and extract
            self._download_and_extract_data(url, tf)
            # Load data (must happen while the temporary directory still exists)
            df = self._load_data(tf)
        if bfill:
            df_bfill = self._read_single_shots_bfill(index=gap_days)
            # Both frames carry single_shots*; merge suffixes them with
            # _x (latest archive) and _y (back-fill).
            df = df.merge(df_bfill, on="date", how="left")
            df = df.assign(
                single_shots=df.single_shots_x.fillna(df.single_shots_y),
                single_shots_2nd=df.single_shots_2nd_x.fillna(df.single_shots_2nd_y),
            )
        return df

    def _load_data(self, path):
        """Combine the daily cumulative series with the single-shot record.

        The single-shot record holds one date only, so after the left merge
        every other date has missing ``single_shots`` / ``single_shots_2nd``.
        """
        df = self._read_data(path)
        df_ss = pd.DataFrame([self._read_single_shots_daily(path)])
        df = df.merge(df_ss, on="date", how="left")
        return df

    def _parse_link_zip(self) -> str:
        """Get link to the latest zip archive.

        Raises:
            ValueError: If the "Download her" anchor is not found on the page.
        """
        soup = get_soup(self.source_url_ref)
        anchor = soup.find("a", string="Download her")
        if anchor is None:
            raise ValueError(f"Download link not found in {self.source_url_ref}")
        return anchor.get("href")

    def _download_and_extract_data(self, url, output_path):
        """Download the zip archive at ``url`` and extract it into ``output_path``.

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        r = requests.get(url)
        # Fail on the HTTP error itself instead of a later, obscure BadZipFile.
        r.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall(output_path)

    def _read_data(self, path) -> pd.DataFrame:
        """Read ``Vaccine_dato.csv`` and return cumulative dose counts by date.

        The source columns are renamed, rows are sorted by date and the three
        dose columns are cumulatively summed. Only ``date`` and the three dose
        columns are kept.
        """
        path = _build_filepath(path, "Vaccine_dato.csv")

        df = (
            _load_datafile(path)
            .rename(
                columns={
                    "Dato": "date",
                    "Antal 1. stik": "people_vaccinated",
                    "Antal 2. stik": "people_fully_vaccinated",
                    "Antal 3. stik": "total_boosters",
                }
            )
            .sort_values("date")
            .transform(
                {
                    "date": lambda x: x,
                    "people_vaccinated": lambda x: x.cumsum(),
                    "people_fully_vaccinated": lambda x: x.cumsum(),
                    "total_boosters": lambda x: x.cumsum(),
                }
            )
        )
        return df

    def _read_single_shots_bfill(self, index=None, date_limit=None):
        """Read single-shot records from older archives (back-fill).

        Args:
            index: Forwarded to :meth:`_get_file_links_bfill`.
            date_limit: Forwarded to :meth:`_get_file_links_bfill`.

        Returns:
            Dataframe with columns ``date``, ``single_shots`` and
            ``single_shots_2nd``; empty (but with those columns) when there is
            no link to process, so callers can always merge on ``date``.
        """
        links = self._get_file_links_bfill(index=index, date_limit=date_limit)
        records = []
        # NOTE(review): only the first link is processed although several may be
        # returned - confirm whether this cap is intentional.
        for link in links[:1]:
            with tempfile.TemporaryDirectory() as tf:
                self._download_and_extract_data(link, tf)
                records.append(self._read_single_shots_daily(tf))
        df = pd.DataFrame(records, columns=["date", "single_shots", "single_shots_2nd"])
        if df.empty:
            # No links (e.g. index == 0): keep numeric dtypes for the later merge/fillna.
            return df.astype({"single_shots": "float64", "single_shots_2nd": "float64"})
        return df.drop_duplicates(subset=["date"], keep="last")

    def _read_single_shots_daily(self, path) -> dict:
        """Build the single-shot record for the archive extracted at ``path``.

        Returns:
            Dict with ``date`` (latest ``Dato`` in ``Vaccine_dato.csv``),
            ``single_shots`` (sum of "Antal 1. stik") and ``single_shots_2nd``
            (sum of "Antal 2. stik"), both restricted to vaccines whose mapped
            name is in ``VACCINES_ONE_DOSE``.

        Raises:
            ValueError: If the file contains an unknown vaccine or region.
        """
        path_ = _build_filepath(path, "Vaccine_type_region.csv")
        df = _load_datafile(path_)
        # Check vaccine names
        vaccines_wrong = set(df.Vaccinenavn).difference(self.vaccines_mapping)
        if vaccines_wrong:
            raise ValueError(f"Unknown vaccine(s) {vaccines_wrong}")
        # Check region names
        regions_wrong = set(df.Region).difference(self.regions_accepted)
        if regions_wrong:
            raise ValueError(f"Unknown region(s) {regions_wrong}")
        # Single shots
        msk = df["Vaccinenavn"].replace(self.vaccines_mapping).isin(VACCINES_ONE_DOSE)
        single_shots = df.loc[msk, "Antal 1. stik"].sum()
        single_shots_2nd = df.loc[msk, "Antal 2. stik"].sum()
        # Load date
        path_ = _build_filepath(path, "Vaccine_dato.csv")
        df = _load_datafile(path_)
        date = df.Dato.max()
        return {
            "date": date,
            "single_shots": single_shots,
            "single_shots_2nd": single_shots_2nd,
        }

    def _get_file_links_bfill(self, index=None, date_limit=None):
        """Return the first ``index`` archive links found on the source page.

        Args:
            index: Number of leading links to return. If None, it is the
                position of the first link whose URL contains ``date_limit``.
            date_limit: Substring to look for in the links when ``index`` is
                None. Defaults to ``date_limit_one_dose_ddmmyyyy``.

        Raises:
            ValueError: If ``index`` is None and no link contains ``date_limit``.
        """
        soup = get_soup(self.source_url_ref)
        # presumably the page lists the newest archive first - verify against the page
        links = [x.a.get("href") for x in soup.find_all("h5")]
        if index is None:
            if date_limit is None:
                date_limit = self.date_limit_one_dose_ddmmyyyy
            matches = [i for i, link in enumerate(links) if date_limit in link]
            if not matches:
                raise ValueError(f"No file link contains {date_limit!r} in {self.source_url_ref}")
            index = matches[0]
        return links[:index]

    def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a ``vaccine`` column listing the vaccines assigned to each date."""

        def _enrich_vaccine(date: str) -> str:
            # Dates are compared as strings, which is valid for YYYY-MM-DD.
            if date >= self.date_limit_one_dose:
                return "Johnson&Johnson, Moderna, Pfizer/BioNTech"
            if date >= "2021-04-14":
                return "Moderna, Pfizer/BioNTech"
            if date >= "2021-02-08":
                return "Moderna, Oxford/AstraZeneca, Pfizer/BioNTech"
            if date >= "2021-01-13":
                return "Moderna, Pfizer/BioNTech"
            return "Pfizer/BioNTech"

        return df.assign(vaccine=df.date.astype(str).apply(_enrich_vaccine))

    def pipe_metrics(self, df: pd.DataFrame, df_current: pd.DataFrame) -> pd.DataFrame:
        """Compute the final metrics, reusing previously exported single-shot values.

        Args:
            df: New data, with ``single_shots`` / ``single_shots_2nd`` possibly missing.
            df_current: Previously exported data with columns ``date``,
                ``single_shots_current`` and ``single_shots_2nd_current``.

        Returns:
            ``df`` with ``total_vaccinations`` added and ``people_fully_vaccinated``
            and ``total_boosters`` adjusted for single-shot vaccines.
        """
        # Merge current data with new
        df = df.merge(df_current, on="date", how="left")
        df = df.assign(
            single_shots=df.single_shots.fillna(df.single_shots_current),
            single_shots_2nd=df.single_shots_2nd.fillna(df.single_shots_2nd_current),
        )
        # Forward-fill every series and treat leading gaps as zero.
        first_doses = df.people_vaccinated.ffill().fillna(0)  # first dose + single shots
        second_doses = df.people_fully_vaccinated.ffill().fillna(0)  # second doses (inc. from single shot vax)
        third_doses = df.total_boosters.ffill().fillna(0)  # third dose
        single_shots = df.single_shots.ffill().fillna(0)  # single shots
        single_shots_2nd = df.single_shots_2nd.ffill().fillna(0)  # second doses of single shots
        return df.assign(
            total_vaccinations=first_doses + second_doses + third_doses,
            # A second dose of a single-shot vaccine is counted as a booster,
            # not as completing the initial protocol.
            people_fully_vaccinated=second_doses + single_shots - single_shots_2nd,
            total_boosters=third_doses + single_shots_2nd,
        )

    def pipeline(self, df: pd.DataFrame, df_current: pd.DataFrame) -> pd.DataFrame:
        """Add metadata, vaccine names and metrics, and select the output columns."""
        return (
            df.assign(
                location=self.location,
                source_url=self.source_url_ref,
            )
            .pipe(self.pipe_vaccine)
            .pipe(self.pipe_metrics, df_current)[
                [
                    "location",
                    "date",
                    "vaccine",
                    "source_url",
                    "total_vaccinations",
                    "people_vaccinated",
                    "people_fully_vaccinated",
                    "total_boosters",
                    "single_shots",
                    "single_shots_2nd",
                ]
            ]
        )

    def read_current(self):
        """Read the single-shot columns of the existing output file.

        The value columns are renamed with a ``_current`` suffix so they do not
        clash with the new data when merged in :meth:`pipe_metrics`.
        """
        return pd.read_csv(self.output_path, usecols=["date", "single_shots", "single_shots_2nd"]).rename(
            columns={"single_shots": "single_shots_current", "single_shots_2nd": "single_shots_2nd_current"}
        )

    def _get_num_gap_days(self, df_current):
        """Return the number of days between the latest exported date and now."""
        return (
            localdatenow(tz=None, as_datetime=True) - clean_date(df_current.date.max(), "%Y-%m-%d", as_datetime=True)
        ).days

    def export(self):
        """Run the whole process and overwrite ``self.output_path`` with the result."""
        # Read current
        df_current = self.read_current()
        index = self._get_num_gap_days(df_current)
        # Read new
        df = self.read(index).pipe(self.pipeline, df_current)
        # Export
        df.to_csv(self.output_path, index=False)


def _load_datafile(path):
    """Read a csv file encoded as ISO-8859-1.

    The file is first parsed with ``;`` as separator. If that yields a single
    column, it is parsed again with ``,`` and that second result is returned.
    """
    read_options = {"encoding": "iso-8859-1"}
    df = pd.read_csv(path, sep=";", **read_options)
    # A single column means ';' did not split the rows; retry with ','.
    if len(df.columns) == 1:
        df = pd.read_csv(path, sep=",", **read_options)
    return df


def _build_filepath(path, filename):
    """Return the path of ``filename`` inside the ``Vaccine_DB`` folder under ``path``."""
    return os.path.join(path, "Vaccine_DB", filename)


def main():
    """Entry point: run the Denmark export."""
    Denmark().export()