Source code for cowidev.vax.incremental.paho

import os
import time
from glob import glob

import pandas as pd

from cowidev.utils.clean import clean_date
from cowidev.utils.web.scraping import get_soup, get_driver

# from cowidev.utils.log import get_logger
from cowidev.vax.utils.files import get_file_encoding
from cowidev.vax.utils.incremental import increment
from cowidev.vax.utils.orgs import WHO_VACCINES, PAHO_COUNTRIES


# logger = get_logger()


[docs]class PAHO:
    source_url = "https://ais.paho.org/imm/IM_DosisAdmin-Vacunacion.asp"
    _download_path = "/tmp"
    columns_mapping = {
        "Country/ Territory": "location",
        "Country code": "country_code",
        "Single dose": "single_dose",
        "First dose": "dose_1",
        "Second dose": "dose_2",
        # "Complete Schedule": "people_fully_vaccinated",
        "Total doses": "total_vaccinations",
        "1st additional dose": "total_boosters_1",
        "2nd additional dose": "total_boosters_2",
        "date": "date",
    }

[docs]    def read(self):
        url = self._parse_iframe_link()
        df = self._parse_data(url)
        return df

[docs]    def _parse_iframe_link(self):
        html = get_soup(self.source_url)
        url = html.find("iframe").get("src")
        return url

[docs]    def _parse_data(self, url: str):
        with get_driver(download_folder=self._download_path) as driver:
            # Go to page
            driver.get(url)
            time.sleep(7.5)
            # Go to tab
            driver.find_element_by_id("tableauTabbedNavigation_tab_2").click()
            time.sleep(5)
            # time.sleep(1)
            # Download data
            self._download_csv(driver, "Crosstab", "RDT: Overview Table")
            # Load downloadded file
            filename = self._get_downloaded_filename()
            df = pd.read_csv(filename, sep="\t", encoding=get_file_encoding(filename), thousands=",")
            os.remove(filename)
            # Get date
            date = self._parse_date(driver)
            df = df.assign(date=date)
        return df

[docs]    def _download_csv(self, driver, option: str, filename: str):
        # Click on download
        driver.find_element_by_id("download-ToolbarButton").click()
        time.sleep(1)
        # Click on Crosstab
        driver.find_element_by_xpath(f"//button[contains(text(),'{option}')]").click()
        time.sleep(3)
        # Select RDT Overview option
        driver.find_element_by_xpath(f"//span[contains(text(),'{filename}')]").click()
        time.sleep(2)
        # Choose CSV
        driver.find_element_by_xpath("//div[contains(text(),'CSV')]").click()
        time.sleep(2)
        # Select RDT Overview option
        # driver.find_element_by_xpath(f"//span[contains(text(),'{filename}')]").click()
        # time.sleep(2)
        # Download
        driver.find_element_by_xpath("//button[contains(text(),'Download')]").click()
        time.sleep(5)

[docs]    def _parse_date(self, driver):
        # fix
        driver.find_element_by_id("tableauTabbedNavigation_tab_0").click()
        time.sleep(5)
        driver.find_element_by_id("tabZoneId77").click()  #87
        time.sleep(1)
        #
        driver.find_element_by_id("download-ToolbarButton").click()
        time.sleep(2)
        driver.find_element_by_xpath(f"//button[contains(text(),'Data')]").click()
        time.sleep(4)
        window_after = driver.window_handles[1]
        driver.switch_to.window(window_after)
        time.sleep(2)
        date_str = driver.find_element_by_tag_name("tbody").text
        date_str = clean_date(date_str, "%m/%d/%Y")
        time.sleep(2)
        window_before = driver.window_handles[0]
        driver.switch_to.window(window_before)
        time.sleep(2)
        return date_str

[docs]    def _get_downloaded_filename(self):
        files = glob(os.path.join(self._download_path, "*.csv"))
        # print(files)
        return max(files, key=os.path.getctime)

[docs]    def pipe_check_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        df.columns = df.columns.str.replace(" \[\d.*", "", regex=True)
        columns_missing = set(self.columns_mapping).difference(df.columns)
        if columns_missing:
            raise ValueError(f"Missing column fields: {columns_missing}. Present columns are: {df.columns}")
        return df

[docs]    def pipe_check_countries(self, df: pd.DataFrame) -> pd.DataFrame:
        pass

[docs]    def pipe_rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.rename(columns=self.columns_mapping)

[docs]    def pipe_filter_countries(self, df: pd.DataFrame) -> pd.DataFrame:
        """Get rows from selected countries."""
        countries_wrong = set(PAHO_COUNTRIES).difference(df.location)
        if countries_wrong:
            raise ValueError(f"Invalid country(s) {countries_wrong}")
        df = df[df.location.isin(PAHO_COUNTRIES)]
        df.loc[:, "location"] = df.location.replace(PAHO_COUNTRIES)
        return df

[docs]    def pipe_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(
            people_vaccinated=df["single_dose"] + df["dose_1"],
            people_fully_vaccinated=df["single_dose"] + df["dose_2"],
        )

[docs]    def pipe_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(
            source_url=self.source_url,
        )

[docs]    def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
        url = "https://covid19.who.int/who-data/vaccination-data.csv"
        df_who = pd.read_csv(url, usecols=["ISO3", "VACCINES_USED"]).rename(columns={"VACCINES_USED": "vaccine"})
        df_who = df_who.dropna(subset=["vaccine"])
        df_who = df_who.assign(
            vaccine=df_who.vaccine.apply(
                lambda x: ", ".join(
                    sorted(set(WHO_VACCINES[xx.strip()] for xx in x.split(",") if xx != "Unknown Vaccine"))
                )
            )
        )
        df = df.merge(df_who, left_on="country_code", right_on="ISO3")
        return df

[docs]    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        return (
            df.pipe(self.pipe_check_columns)
            .pipe(self.pipe_rename_columns)
            .pipe(self.pipe_filter_countries)
            .pipe(self.pipe_metrics)
            .pipe(self.pipe_metadata)
            .pipe(self.pipe_vaccine)
        )

[docs]    def increment_countries(self, df: pd.DataFrame):
        for row in df.sort_values("location").iterrows():
            row = row[1]
            increment(
                location=row["location"],
                total_vaccinations=row["total_vaccinations"],
                people_vaccinated=row["people_vaccinated"],
                people_fully_vaccinated=row["people_fully_vaccinated"],
                total_boosters=row["total_boosters_1"] + row["total_boosters_2"],
                date=row["date"],
                vaccine=row["vaccine"],
                source_url=row["source_url"],
            )
            country = row["location"]
            # logger.info(f"\tVAX - vax.incremental.paho.{country}: SUCCESS ✅")

[docs]    def export(self):
        df = self.read().pipe(self.pipeline)
        self.increment_countries(df)


[docs]def main():
    PAHO().export()


if __name__=="__main__":
    main()