# Source code for cowidev.vax.batch.jersey

import io
import re
import tempfile

import pandas as pd
import requests

from cowidev.vax.utils.base import CountryVaxBase


[docs]class Jersey(CountryVaxBase):
    def __init__(self):
        """Constructor.

        Args:
            source_url (str): Source data url
            location (str): Location name
            columns_rename (dict, optional): Maps original to new names. Defaults to None.
        """
        self.source_url = "https://www.gov.je/Datasets/ListOpenData?ListName=COVID19Weekly&clean=true"
        self.location = "Jersey"
        self.columns_rename = {
            "Date": "date",
            "VaccinationsTotalNumberDoses": "total_vaccinations",
            "VaccinationsTotalNumberFirstDoseVaccinations": "people_vaccinated",
            "VaccinationsTotalNumberSecondDoseVaccinations": "people_fully_vaccinated",
            "VaccinationsTotalNumberThirdDoseVaccinations": "total_boosters",
            "VaccinationsTotalNumberFourthDoseVaccinations": "total_boosters_2",
        }

[docs]    def read(self):
        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, mode="wb") as f:
                f.write(requests.get(self.source_url).content)
            return pd.read_csv(tf.name)

[docs]    def pipe_select_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        return df[self.columns_rename.keys()]

[docs]    def pipe_rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        return df[self.columns_rename.keys()].rename(columns=self.columns_rename)

[docs]    def pipe_enrich_vaccine_name(self, df: pd.DataFrame) -> pd.DataFrame:
        def _enrich_vaccine(date: str) -> str:
            if date >= "2021-04-07":
                return "Moderna, Oxford/AstraZeneca, Pfizer/BioNTech"
            return "Oxford/AstraZeneca, Pfizer/BioNTech"

        return df.assign(vaccine=df.date.astype(str).apply(_enrich_vaccine))

[docs]    def pipe_enrich_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(location=self.location, source_url=self.source_url)

[docs]    def pipe_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(total_boosters=df.total_boosters + df.total_boosters_2.fillna(0))

[docs]    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        return (
            df.pipe(self.pipe_select_columns)
            .pipe(self.pipe_rename_columns)
            .pipe(self.pipe_enrich_vaccine_name)
            .pipe(self.pipe_enrich_columns)
            .pipe(self.pipe_metrics)
            .sort_values("date")[
                [
                    "location",
                    "date",
                    "vaccine",
                    "source_url",
                    "total_vaccinations",
                    "people_vaccinated",
                    "people_fully_vaccinated",
                    "total_boosters",
                ]
            ]
        )

[docs]    def pipe_age_select_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        # print(df.columns[-15:])
        return df[
            [
                "Date",
                "VaccinationsPercentagePopulationVaccinatedFirstDose80yearsandover",
                "VaccinationsPercentagePopulationVaccinatedFirstDose75to79years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose70to74years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose65to69years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose60to64years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose55to59years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose50to54years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose40to49years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose30to39years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose18to29years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose16to17years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose12to15years",
                "VaccinationsPercentagePopulationVaccinatedFirstDose5to11years",
                # "VaccinationsPercentagePopulationVaccinatedFirstDose0to5years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose80yearsandover",
                "VaccinationsPercentagePopulationVaccinatedSecondDose75to79years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose70to74years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose65to69years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose60to64years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose55to59years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose50to54years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose40to49years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose30to39years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose18to29years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose16to17years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose12to15years",
                "VaccinationsPercentagePopulationVaccinatedSecondDose5to11years",
                # "VaccinationsPercentagePopulationVaccinatedSecondDose0to5years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose80yearsandover",
                "VaccinationsPercentagePopulationVaccinatedThirdDose75to79years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose70to74years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose65to69years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose60to64years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose55to59years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose50to54years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose40to49years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose30to39years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose18to29years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose16to17years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose12to15years",
                "VaccinationsPercentagePopulationVaccinatedThirdDose5to11years",
                # "VaccinationsPercentagePopulationVaccinatedThirdDose0to5years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose80yearsandover",
                "VaccinationsPercentagePopulationVaccinatedFourthDose75to79years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose70to74years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose65to69years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose60to64years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose55to59years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose50to54years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose40to49years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose30to39years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose18to29years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose16to17years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose12to15years",
                "VaccinationsPercentagePopulationVaccinatedFourthDose5to11years",
            ]
        ]

[docs]    def _extract_age_group(self, age_group_raw):
        # regex_17 = r"VaccinationsPercentagePopulationVaccinated(?:First|Second|Third)Dose17yearsandunder"
        regex_80 = r"VaccinationsPercentagePopulationVaccinated(?:First|Second|Third|Fourth)Dose80yearsandover"
        regex = r"VaccinationsPercentagePopulationVaccinated(?:First|Second|Third|Fourth)Dose(\d+)to(\d+)years"
        # if re.match(regex_17, age_group_raw):
        #     age_group = "0-17"
        if re.match(regex_80, age_group_raw):
            age_group = "80-"
        elif re.match(regex, age_group_raw):
            age_group = "-".join(re.match(regex, age_group_raw).group(1, 2))
        return age_group

[docs]    def pipe_age_create_groups(self, df: pd.DataFrame) -> pd.DataFrame:
        # Split data in dataframes with first and second doses
        df1 = df.filter(regex=r"Date|VaccinationsPercentagePopulationVaccinatedFirstDose.*")
        df2 = df.filter(regex=r"Date|VaccinationsPercentagePopulationVaccinatedSecondDose.*")
        df3 = df.filter(regex=r"Date|VaccinationsPercentagePopulationVaccinatedThirdDose.*")
        df4 = df.filter(regex=r"Date|VaccinationsPercentagePopulationVaccinatedFourthDose.*")
        # Melt dataframes
        df1 = df1.melt(
            id_vars="Date",
            var_name="age_group",
            value_name="people_vaccinated_per_hundred",
        )
        df2 = df2.melt(
            id_vars="Date",
            var_name="age_group",
            value_name="people_fully_vaccinated_per_hundred",
        )
        df3 = df3.melt(
            id_vars="Date",
            var_name="age_group",
            value_name="people_with_booster_per_hundred",
        )
        df4 = df4.melt(
            id_vars="Date",
            var_name="age_group",
            value_name="people_with_booster_2_per_hundred",
        )
        # Process and merge dataframes
        df1 = df1.assign(age_group=df1.age_group.apply(self._extract_age_group))
        df2 = df2.assign(age_group=df2.age_group.apply(self._extract_age_group))
        df3 = df3.assign(age_group=df3.age_group.apply(self._extract_age_group))
        df4 = df4.assign(age_group=df4.age_group.apply(self._extract_age_group))
        df = df1.merge(df2, on=["Date", "age_group"]).dropna(subset=["Date"])
        df = df.merge(df3, on=["Date", "age_group"]).dropna(subset=["Date"])
        return df

[docs]    def pipe_age_rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.rename(columns={"Date": "date"})

[docs]    def pipe_age_minmax_values(self, df: pd.DataFrame) -> pd.DataFrame:
        df[["age_group_min", "age_group_max"]] = df.age_group.str.split("-", expand=True)
        return df

[docs]    def pipe_metrics_scale_100(self, df: pd.DataFrame) -> pd.DataFrame:
        column_metrics = [
            "people_vaccinated_per_hundred",
            "people_fully_vaccinated_per_hundred",
            "people_with_booster_per_hundred",
        ]
        df[column_metrics] = (df[column_metrics] * 100).round(2)
        return df

[docs]    def pipe_age_fix_dp(self, df: pd.DataFrame) -> pd.DataFrame:
        column_metrics = [
            "people_vaccinated_per_hundred",
            "people_fully_vaccinated_per_hundred",
            "people_with_booster_per_hundred",
        ]
        dt_min = "2021-09-05"
        dt_max = "2021-09-22"
        msk = (df.date >= dt_min) & (df.date <= dt_max)
        df.loc[msk, column_metrics] = df.loc[msk, column_metrics] * 100
        msk = df[column_metrics] > 100
        if (df[column_metrics] > 100).any(None):
            raise ValueError(f"Check fixed datapoints ({dt_min}<date<{dt_max}), they might be already fine!")
        return df

[docs]    def pipe_age_filter(self, df: pd.DataFrame) -> pd.DataFrame:
        df.loc[(df.date == "2021-08-29"), "people_fully_vaccinated_per_hundred"] = None
        df.loc[(df.date == "2021-09-05") & (df.age_group_min == "18"), "people_vaccinated_per_hundred"] = None
        df.loc[(df.date == "2021-09-05") & (df.age_group_min == "40"), "people_vaccinated_per_hundred"] = None
        return df
        # df.pipe(make_monotonic, "date", ["people_vaccinated_per_hundred", "people_fully_vaccinated_per_hundred"])

[docs]    def pipeline_age(self, df: pd.DataFrame) -> pd.DataFrame:
        return (
            df.pipe(self.pipe_age_select_columns)
            .pipe(self.pipe_age_create_groups)
            .pipe(self.pipe_age_rename_columns)
            .pipe(self.pipe_age_minmax_values)
            .pipe(self.pipe_enrich_columns)
            .pipe(self.pipe_metrics_scale_100)
            # .pipe(self.pipe_age_fix_dp)
            .pipe(self.pipe_age_filter)
            .sort_values(["date", "age_group_min"])[
                [
                    "location",
                    "date",
                    "age_group_min",
                    "age_group_max",
                    "people_vaccinated_per_hundred",
                    "people_fully_vaccinated_per_hundred",
                    "people_with_booster_per_hundred",
                ]
            ]
        )

[docs]    def pipeline_base(self, df: pd.DataFrame) -> pd.DataFrame:
        return (
            df.drop_duplicates()
            .sort_values(["Date", "VaccinationsPercentagePopulationVaccinatedFirstDose80yearsandover"])
            .drop_duplicates("Date", keep="last")
        )

[docs]    def export(self):
        """Generalized."""
        df_base = self.read().pipe(self.pipeline_base)
        # Main data
        df = df_base.pipe(self.pipeline)
        # Age data
        df_age = df_base.pipe(self.pipeline_age)
        # Export
        self.export_datafile(
            df, df_age=df_age, meta_age={"source_name": "Government of Jersey", "source_url": self.source_url}
        )


def main():
    """Run the full Jersey export."""
    scraper = Jersey()
    scraper.export()