# Source code for cowidev.vax.utils.base

import os
import pandas as pd

from cowidev import PATHS
from cowidev.utils.s3 import S3, obj_from_s3
from cowidev.utils.utils import make_monotonic as mkm
from cowidev.utils.clean.dates import localdate
from cowidev.utils.clean.numbers import metrics_to_num_int, metrics_to_num_float
from cowidev.vax.utils.files import export_metadata


# Preferred column order of the main per-country output. `_postprocessing`
# places these columns first (those that exist in the data), in this order.
COLUMNS_ORDER = [
    "location",
    "date",
    "vaccine",
    "source_url",
    "total_vaccinations",
    "people_vaccinated",
    "people_fully_vaccinated",
    "total_boosters",
]

# Columns (and their order) kept in the age-group output. `_postprocessing_age`
# drops every column that is not listed here.
COLUMNS_ORDER_AGE = [
    "location",
    "date",
    "age_group_min",
    "age_group_max",
    "people_vaccinated_per_hundred",
    "people_fully_vaccinated_per_hundred",
    "people_with_booster_per_hundred",
]

# Columns (and their order) kept in the manufacturer output.
# `_postprocessing_manufacturer` drops every column that is not listed here.
COLUMNS_ORDER_MANUF = [
    "location",
    "date",
    "vaccine",
    "total_vaccinations",
]

# Names of all metric columns known to this module. Used to select the columns
# passed to the numeric converters, to the monotonicity helper, and to the
# metric-based de-duplication in the post-processing steps.
METRICS = [
    "total_vaccinations",
    "people_vaccinated",
    "people_fully_vaccinated",
    "total_boosters",
    "total_vaccinations_per_hundred",
    "people_vaccinated_per_hundred",
    "people_fully_vaccinated_per_hundred",
    "people_with_booster_per_hundred",
]


class CountryVaxBase:
    """Shared helpers for per-country vaccination data pipelines.

    Provides output-path resolution, loading of previously exported data,
    post-processing (typing, ordering, de-duplication) and CSV export for the
    main, age-group and manufacturer datasets.

    Subclasses must set the class attribute ``location``. Some helpers also read
    attributes that are not defined in this class and must be supplied by the
    subclass: ``source_url_ref`` (`pipe_metadata`), ``rename_columns``
    (`pipe_rename_columns`) and ``vaccine_mapping`` (`pipe_check_vaccine`).
    """

    # Country name; also used as the default file stem of every output file.
    location: str = None

    def __init__(self):
        """Ensure the subclass declared a ``location``.

        Raises:
            NotImplementedError: If the class attribute ``location`` is None.
        """
        if self.location is None:
            raise NotImplementedError("Please define class attribute `location`")

    def from_ice(self):
        """Loads single CSV `location.csv` from S3 as DataFrame.

        The file age is validated first with `_check_last_update`, which raises
        if the file is too old.
        """
        path = f"{PATHS.S3_VAX_ICE_DIR}/{self.location}.csv"
        _check_last_update(path, self.location)
        return obj_from_s3(path)

    @property
    def output_path(self):
        """Country output file."""
        return os.path.join(PATHS.INTERNAL_OUTPUT_VAX_MAIN_DIR, f"{self.location}.csv")

    @property
    def output_path_age(self):
        """Country output file for age-group data."""
        return os.path.join(PATHS.INTERNAL_OUTPUT_VAX_AGE_DIR, f"{self.location}.csv")

    @property
    def output_path_manufacturer(self):
        """Country output file for manufacturer data."""
        return os.path.join(PATHS.INTERNAL_OUTPUT_VAX_MANUFACT_DIR, f"{self.location}.csv")

    def get_output_path(self, filename=None, age=False, manufacturer=False):
        """Build the path of an output CSV file.

        Args:
            filename (str, optional): File stem (without ``.csv``). If None, the
                country default path for the selected dataset is returned.
            age (bool, optional): Select the age-group dataset. Takes precedence
                over `manufacturer`. Defaults to False.
            manufacturer (bool, optional): Select the manufacturer dataset.
                Defaults to False.

        Returns:
            str: Path to the CSV file.
        """
        if age:
            if filename is None:
                return self.output_path_age
            return os.path.join(PATHS.INTERNAL_OUTPUT_VAX_AGE_DIR, f"{filename}.csv")
        if manufacturer:
            if filename is None:
                return self.output_path_manufacturer
            return os.path.join(PATHS.INTERNAL_OUTPUT_VAX_MANUFACT_DIR, f"{filename}.csv")
        if filename is None:
            return self.output_path
        return os.path.join(PATHS.INTERNAL_OUTPUT_VAX_MAIN_DIR, f"{filename}.csv")

    def load_datafile(self, **kwargs):
        """Read the main country output file; `kwargs` go to `pd.read_csv`."""
        return pd.read_csv(self.output_path, **kwargs)

    def last_update(self, **kwargs):
        """Return the maximum value of the `date` column of the main output file."""
        df = self.load_datafile(**kwargs)
        return df.date.max()

    def make_monotonic(self, df, group_cols=None, max_removed_rows=10, strict=False):
        """Apply `cowidev.utils.utils.make_monotonic` to the metric columns of `df`.

        Args:
            df (pd.DataFrame): Data with a `date` column.
            group_cols (str or list, optional): If given, the helper is applied
                to each group separately and the results are concatenated with
                a fresh index. Defaults to None (whole frame at once).
            max_removed_rows (int, optional): Forwarded to the helper. Defaults to 10.
            strict (bool, optional): Forwarded to the helper. Defaults to False.

        Returns:
            pd.DataFrame: Result of the helper (concatenated over groups if
            `group_cols` is given).
        """
        # Only metric columns that are actually present are checked.
        metrics = [m for m in METRICS if m in df.columns]
        if not group_cols:
            return mkm(
                df=df,
                column_date="date",
                column_metrics=metrics,
                max_removed_rows=max_removed_rows,
                strict=strict,
                new=True,
            )
        dfs = [
            mkm(
                df=df_group,
                column_date="date",
                column_metrics=metrics,
                max_removed_rows=max_removed_rows,
                strict=strict,
                new=True,
            )
            for _, df_group in df.groupby(group_cols)
        ]
        if not dfs:
            # No groups (e.g. empty input): `pd.concat([])` would raise, so
            # return an empty frame with the same columns instead.
            return df.iloc[0:0].reset_index(drop=True)
        return pd.concat(dfs, ignore_index=True)

    def _postprocessing(self, df, valid_cols_only):
        """Minor post processing after all transformations.

        Converts metrics with `metrics_to_num_int`, sorts by date, applies the
        `COLUMNS_ORDER` column order and removes duplicated rows.

        Args:
            df (pd.DataFrame): Main country data.
            valid_cols_only (bool): If True, keep only columns listed in
                `COLUMNS_ORDER`; otherwise other columns are appended after them.

        Returns:
            pd.DataFrame: Processed data.
        """
        df = metrics_to_num_int(df, METRICS)
        df = df.sort_values("date")
        cols = [col for col in COLUMNS_ORDER if col in df.columns]
        if not valid_cols_only:
            cols += [col for col in df.columns if col not in COLUMNS_ORDER]
        df = df[cols]
        # Rows repeating the metric values of an earlier date carry no new
        # information: keep the earliest one.
        df = df.drop_duplicates(subset=[m for m in METRICS if m in df.columns], keep="first")
        # At most one row per date: keep the last one.
        df = df.drop_duplicates(subset=["date"], keep="last")
        return df

    def _postprocessing_age(self, df):
        """Minor post processing of age-group data after all transformations.

        Converts metrics with `metrics_to_num_float`, sorts by date and age
        group, and keeps only the columns in `COLUMNS_ORDER_AGE` (in that order).
        """
        df = metrics_to_num_float(df, METRICS)
        df = df.sort_values(["date", "age_group_min", "age_group_max"])
        cols = [col for col in COLUMNS_ORDER_AGE if col in df.columns]
        return df[cols]

    def _postprocessing_manufacturer(self, df):
        """Minor post processing of manufacturer data after all transformations.

        Converts metrics with `metrics_to_num_int`, sorts by vaccine and date,
        keeps only the columns in `COLUMNS_ORDER_MANUF` (in that order) and
        removes duplicated rows.
        """
        df = metrics_to_num_int(df, METRICS)
        df = df.sort_values(["vaccine", "date"])
        cols = [col for col in COLUMNS_ORDER_MANUF if col in df.columns]
        df = df[cols]
        # Same metric values for the same vaccine: keep the earliest row.
        df = df.drop_duplicates(subset=[m for m in METRICS + ["vaccine"] if m in df.columns], keep="first")
        # At most one row per (date, vaccine): keep the last one.
        df = df.drop_duplicates(subset=["date", "vaccine"], keep="last")
        return df

    def export_datafile(
        self,
        df=None,
        df_age=None,
        df_manufacturer=None,
        meta_age=None,
        meta_manufacturer=None,
        filename=None,
        attach=False,
        attach_age=False,
        attach_manufacturer=False,
        reset_index=False,
        valid_cols_only=False,
        **kwargs,
    ):
        """Export country data.

        Each dataset is exported only if its DataFrame is not None.

        Args:
            df (pd.DataFrame): Main country data.
            df_age (pd.DataFrame, optional): Country data by age group. Defaults to None.
            df_manufacturer (pd.DataFrame, optional): Country data by manufacturer. Defaults to None.
            meta_age (dict, optional): Country metadata by age. Required (with string values for keys
                'source_name' and 'source_url') when `df_age` is given. Defaults to None.
            meta_manufacturer (dict, optional): Country metadata by manufacturer. Required (same keys)
                when `df_manufacturer` is given. Defaults to None.
            filename (str, optional): Name of output file. If None, defaults to country name.
            attach (bool, optional): Set to True to attach to already existing data. Defaults to False.
            attach_age (bool, optional): Set to True to attach to already existing data. Defaults to False.
            attach_manufacturer (bool, optional): Set to True to attach to already existing data. Defaults to False.
            valid_cols_only (bool, optional): Export only valid columns. Defaults to False.
            reset_index (bool, optional): Reset the index of the main data before writing. Defaults to False.
                NOTE(review): this was documented as bringing the index back as a column, but the
                implementation calls `reset_index(drop=True)`, which discards the index.
            **kwargs: Forwarded to `DataFrame.to_csv` for the main data file.
        """
        if df is not None:
            self._export_datafile_main(
                df,
                filename=filename,
                attach=attach,
                reset_index=reset_index,
                valid_cols_only=valid_cols_only,
                **kwargs,
            )
        if df_age is not None:
            self._export_datafile_age(df_age, meta_age, filename=filename, attach=attach_age)
        if df_manufacturer is not None:
            self._export_datafile_manufacturer(
                df_manufacturer, meta_manufacturer, filename=filename, attach=attach_manufacturer
            )

    def pipe_merge_with_current(self, df, filename=None):
        """Merge `df` with the already exported main data file (if it exists)."""
        filename = self.get_output_path(filename)
        return merge_with_current_data(df, filename)

    def _export_datafile_main(self, df, filename, attach=False, reset_index=False, valid_cols_only=False, **kwargs):
        """Export main data.

        Raises:
            TypeError: If `df` is not a DataFrame at export time.
        """
        filename = self.get_output_path(filename)
        if attach:
            df = merge_with_current_data(df, filename)
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"df must be a pandas DataFrame!. Instead {type(df).__name__} was detected.")
        df = self._postprocessing(df, valid_cols_only)
        if reset_index:
            df = df.reset_index(drop=True)
        df.to_csv(filename, index=False, **kwargs)

    def _export_datafile_age(self, df, metadata, filename, attach):
        """Export age data."""
        filename = self.get_output_path(filename, age=True)
        if attach:
            df = merge_with_current_data(df, filename)
        df = self._postprocessing_age(df)
        self._export_datafile_secondary(df, metadata, filename, PATHS.INTERNAL_OUTPUT_VAX_META_AGE_FILE)

    def _export_datafile_manufacturer(self, df, metadata, filename, attach):
        """Export manufacturer data."""
        filename = self.get_output_path(filename, manufacturer=True)
        if attach:
            df = merge_with_current_data(df, filename)
        df = self._postprocessing_manufacturer(df)
        self._export_datafile_secondary(df, metadata, filename, PATHS.INTERNAL_OUTPUT_VAX_META_MANUFACT_FILE)

    def _export_datafile_secondary(self, df, metadata, output_path, output_path_meta):
        """Export secondary data.

        The metadata is validated before anything is written, so an invalid
        `metadata` leaves both files untouched.
        """
        # Check metadata
        self._check_metadata(metadata)
        # Export data
        df.to_csv(output_path, index=False)
        # Export metadata
        export_metadata(df, metadata["source_name"], metadata["source_url"], output_path_meta)

    def _check_metadata(self, metadata):
        """Validate the metadata of a secondary (age / manufacturer) dataset.

        Raises:
            ValueError: If `metadata` is not a dict, lacks the keys
                'source_name' / 'source_url', or their values are not strings.
        """
        if not isinstance(metadata, dict):
            raise ValueError("The `metadata` object must be a dictionary!")
        if ("source_name" not in metadata) or ("source_url" not in metadata):
            raise ValueError("`metadata` must contain keys 'source_name' and 'source_url'")
        if not (isinstance(metadata["source_name"], str) and isinstance(metadata["source_url"], str)):
            raise ValueError("metadata['source_name'] and metadata['source_url'] must be strings!")

    def _check_attributes(self, mapping):
        """Raise ValueError if any value of `mapping` is None."""
        for field_raw, field in mapping.items():
            if field is None:
                raise ValueError(f"Please check class attribute {field_raw}, it can't be None!")

    def pipe_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add `location` and `source_url` to the data, unless already present.

        Accepts a DataFrame or a Series (treated as a single row); the return
        type matches the input type.

        Raises:
            ValueError: If a value that has to be added is None.
        """
        if is_series := isinstance(df, pd.Series):
            df = pd.DataFrame(df).T
        mapping = {
            "location": self.location,
            "source_url": self.source_url_ref,
        }
        # Never overwrite columns that the data already provides.
        mapping = {k: v for k, v in mapping.items() if k not in df}
        self._check_attributes(mapping)
        df = df.assign(**mapping)
        if is_series:
            return df.iloc[0]
        return df

    def pipe_rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename columns according to the `rename_columns` attribute."""
        return df.rename(columns=self.rename_columns)

    def force_monotonic(self):
        """Re-read the main output file, apply `make_monotonic` and export it again."""
        df = pd.read_csv(self.output_path).pipe(self.make_monotonic)
        self.export_datafile(df)

    def pipe_age_per_capita(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add `*_per_hundred` columns to age-group data.

        The population of each age group is built with
        `_build_population_age_group_df` and merged on `age_group_min` and
        `age_group_max`. `df` must contain the columns `people_vaccinated`,
        `people_fully_vaccinated` and `people_with_booster`.
        """
        # Build population df by age group
        pop_age = _build_population_age_group_df(self.location, df)
        # Normalize
        df = df.merge(pop_age, on=["age_group_min", "age_group_max"])
        metrics = ["people_vaccinated", "people_fully_vaccinated", "people_with_booster"]
        for metric in metrics:
            df = df.assign(**{f"{metric}_per_hundred": (df[metric] / df.population * 100).round(2)})
        return df

    def pipe_check_vaccine(self, df: pd.DataFrame, vaccines_accepted=None) -> pd.DataFrame:
        """Validate the `vaccine` column and return `df` unchanged.

        Args:
            df (pd.DataFrame): Data with a `vaccine` column.
            vaccines_accepted (iterable, optional): Accepted values. Defaults
                to the keys of the `vaccine_mapping` attribute.

        Raises:
            ValueError: If `vaccine` contains a value that is not accepted.
        """
        if vaccines_accepted is None:
            vaccines_accepted = self.vaccine_mapping.keys()
        self.check_column_values(df, "vaccine", vaccines_accepted)
        return df

    def check_column_values(self, df: pd.DataFrame, col_name: str, values_accepted: list) -> None:
        """Check that column `col_name` only contains accepted values.

        Raises:
            ValueError: If the column contains values not in `values_accepted`.
        """
        unknown_values = set(df[col_name]).difference(values_accepted)
        if unknown_values:
            raise ValueError(f"Found unknown values for `{col_name}`: {unknown_values}")


def _build_population_age_group_df(location, df):
    """Build the population of every age group found in `df` for one location.

    Args:
        location (str): Location to select in the population-by-age file
            (`PATHS.INTERNAL_INPUT_UN_POPULATION_AGE_FILE`).
        df (pd.DataFrame): Data with columns `age_group_min` and
            `age_group_max`. Empty strings and missing values are treated as
            open bounds.

    Returns:
        pd.DataFrame: One row per distinct age group with columns
        `age_group_min` and `age_group_max` (as strings, open bounds as empty
        strings) and `population` (summed over the ages within the group,
        bounds included).

    Raises:
        KeyError: If `location` is not present in the population file.
    """
    # Read raw population by age
    pop_age = pd.read_csv(PATHS.INTERNAL_INPUT_UN_POPULATION_AGE_FILE, index_col="location")
    # Filter location. The list selector guarantees a DataFrame even when the
    # location has a single row (a scalar label would return a Series there).
    pop_age = pop_age.loc[[location]]
    # Extract age groups of interest; open bounds become the sentinels
    # -1000 / 1000 so that they can be compared numerically.
    ages = df[["age_group_min", "age_group_max"]].drop_duplicates()
    ages["age_group_max"] = ages["age_group_max"].replace("", 1000).astype(float).fillna(1000)
    ages["age_group_min"] = ages["age_group_min"].replace("", -1000).astype(float).fillna(-1000)
    # Build population records for age groups
    records = [
        {
            "age_group_min": age_min,
            "age_group_max": age_max,
            "population": pop_age.loc[(pop_age.age >= age_min) & (pop_age.age <= age_max), "population"].sum(),
        }
        for age_min, age_max in ages.values.tolist()
    ]
    # Build Dataframe
    pop_age = pd.DataFrame(records)
    # Bounds go back to strings, with the sentinels restored to empty strings.
    pop_age = pop_age.astype(int).astype({"age_group_min": str, "age_group_max": str})
    pop_age = pop_age.assign(
        age_group_max=pop_age.age_group_max.replace("1000", ""),
        age_group_min=pop_age.age_group_min.replace("-1000", ""),
    )
    return pop_age


def _check_last_update(path, country, max_days=4):
    """Raise if the S3 object at `path` was last modified too long ago.

    Args:
        path (str): S3 path of the file to check.
        country (str): Country name; only used in the error message.
        max_days (int, optional): Maximum accepted age of the file, in whole
            days. Defaults to 4.

    Raises:
        FileExistsError: If the file is more than `max_days` days old.
    """
    metadata = S3().get_metadata(path)
    last_update = metadata["LastModified"]
    # NOTE(review): the subtraction below needs `now` and `last_update` to be
    # both timezone-aware or both naive -- confirm against `localdate`.
    now = localdate(force_today=True, as_datetime=True)
    num_days = (now - last_update).days
    if num_days > max_days:
        raise FileExistsError(
            f"ICE File for {country} is too old ({num_days} days old)! Please check cowidev.vax.icer"
        )


def merge_with_current_data(df: pd.DataFrame, filepath: str) -> pd.DataFrame:
    """Merge new data with the data already stored in `filepath`.

    Rows of the stored file whose `date` also appears in `df` are discarded, so
    the new data takes precedence. The result is sorted by `date`. If the file
    does not exist, the new data is returned on its own.

    Args:
        df (pd.DataFrame or pd.Series): New data, with a `date` field. A Series
            is treated as a single row and converted to a DataFrame.
        filepath (str): Path to the CSV file with the current data.

    Returns:
        pd.DataFrame: Merged data.

    Raises:
        TypeError: If `df` is neither a DataFrame nor a Series.
    """
    # Type check. Done before looking at the file so that the return type does
    # not depend on whether the file already exists.
    if isinstance(df, pd.Series):
        df = df.to_frame().T
    elif not isinstance(df, pd.DataFrame):
        raise TypeError(f"`df` must be a pandas DataFrame!. Instead {type(df).__name__} was detected.")
    if not os.path.isfile(filepath):
        return df
    # Load
    df_current = pd.read_csv(filepath)
    # Merge
    # NOTE(review): dates are matched by value; dates read from the CSV are
    # presumably strings, so `df.date` should hold strings too -- confirm.
    df_current = df_current[~df_current.date.isin(df.date)]
    return pd.concat([df, df_current]).sort_values(by="date")