Source code for cowidev.vax.incremental.paho

import os
import time
from glob import glob

import pandas as pd

from cowidev.utils.clean import clean_date
from cowidev.utils.web.scraping import get_soup, get_driver

# from cowidev.utils.log import get_logger
from cowidev.vax.utils.files import get_file_encoding
from cowidev.vax.utils.incremental import increment
from cowidev.vax.utils.orgs import WHO_VACCINES, PAHO_COUNTRIES


# logger = get_logger()


class PAHO:
    """Incremental scraper for the PAHO vaccination dashboard.

    The workflow is: locate the dashboard embedded in ``source_url``, drive a
    browser to download the overview crosstab, read it into a DataFrame, run
    the ``pipe_*`` cleaning steps and finally hand each row to ``increment``.
    """

    # Page that embeds the dashboard.
    source_url = "https://ais.paho.org/imm/IM_DosisAdmin-Vacunacion.asp"
    # Folder handed to the browser driver as download target.
    _download_path = "/tmp"
    # Raw column name (after suffix clean-up) -> internal column name.
    columns_mapping = {
        "Country/ Territory": "location",
        "Country code": "country_code",
        "Single dose": "single_dose",
        "First dose": "dose_1",
        "Second dose": "dose_2",
        # "Complete Schedule": "people_fully_vaccinated",
        "Total doses": "total_vaccinations",
        "1st additional dose": "total_boosters_1",
        "2nd additional dose": "total_boosters_2",
        "date": "date",
    }

    def read(self):
        """Download the dashboard data and return it as a raw DataFrame."""
        url = self._parse_iframe_link()
        return self._parse_data(url)

    def _parse_iframe_link(self):
        """Return the ``src`` of the first iframe found in ``source_url``.

        NOTE(review): ``read`` calls this helper but it was missing from the
        class. It is reconstructed here on the assumption that ``get_soup``
        returns a BeautifulSoup-like document and that the dashboard is the
        first iframe of the page -- confirm against the live page.

        Raises:
            ValueError: If the page contains no iframe.
        """
        soup = get_soup(self.source_url)
        iframe = soup.find("iframe")
        if iframe is None:
            raise ValueError(f"No iframe found in {self.source_url}")
        return iframe.get("src")

    def _parse_data(self, url: str):
        """Download the overview crosstab from ``url`` and load it.

        Args:
            url: Address of the dashboard to open in the browser.

        Returns:
            DataFrame with the downloaded table plus a ``date`` column.
        """
        with get_driver(download_folder=self._download_path) as driver:
            # Go to page
            driver.get(url)
            time.sleep(7.5)
            # Go to tab
            # String locators ("id", "xpath", "tag name") are used throughout
            # because the find_element_by_* helpers no longer exist in
            # Selenium >= 4.3, while find_element(by, value) works everywhere.
            driver.find_element("id", "tableauTabbedNavigation_tab_2").click()
            time.sleep(5)
            # Download data
            self._download_csv(driver, "Crosstab", "RDT: Overview Table")
            # Load downloaded file
            filename = self._get_downloaded_filename()
            try:
                df = pd.read_csv(filename, sep="\t", encoding=get_file_encoding(filename), thousands=",")
            finally:
                # Always clean up, so a failed parse does not leave a stale
                # file that a later run could pick up.
                os.remove(filename)
            # Get date
            date = self._parse_date(driver)
            df = df.assign(date=date)
        return df

    def _download_csv(self, driver, option: str, filename: str):
        """Click through the download dialog to export a sheet as CSV.

        Args:
            driver: Browser driver currently showing the dashboard.
            option: Text of the export button to press (e.g. "Crosstab").
            filename: Text of the sheet entry to select in the dialog.
        """
        # Click on download
        driver.find_element("id", "download-ToolbarButton").click()
        time.sleep(1)
        # Click on the export option (e.g. Crosstab)
        driver.find_element("xpath", f"//button[contains(text(),'{option}')]").click()
        time.sleep(3)
        # Select the sheet to export
        driver.find_element("xpath", f"//span[contains(text(),'{filename}')]").click()
        time.sleep(2)
        # Choose CSV
        driver.find_element("xpath", "//div[contains(text(),'CSV')]").click()
        time.sleep(2)
        # Download
        driver.find_element("xpath", "//button[contains(text(),'Download')]").click()
        time.sleep(5)

    def _parse_date(self, driver):
        """Read the date shown by the dashboard and return it cleaned.

        The value is taken from the ``tbody`` of the window opened by the
        "Data" button and passed through ``clean_date`` with the
        ``%m/%d/%Y`` format.
        """
        driver.find_element("id", "tableauTabbedNavigation_tab_0").click()
        time.sleep(5)
        driver.find_element("id", "tabZoneId77").click()  # 87
        time.sleep(1)
        # Open the download menu; presumably required for the "Data" button
        # below to be present -- verify against the dashboard.
        driver.find_element("id", "download-ToolbarButton").click()
        time.sleep(2)
        driver.find_element("xpath", "//button[contains(text(),'Data')]").click()
        time.sleep(4)
        # The data view opens in a second window.
        window_after = driver.window_handles[1]
        driver.switch_to.window(window_after)
        time.sleep(2)
        date_str = driver.find_element("tag name", "tbody").text
        date_str = clean_date(date_str, "%m/%d/%Y")
        time.sleep(2)
        # Return to the original window so the caller can keep using it.
        window_before = driver.window_handles[0]
        driver.switch_to.window(window_before)
        time.sleep(2)
        return date_str

    def _get_downloaded_filename(self):
        """Return the most recently created ``*.csv`` in the download folder.

        Raises:
            ValueError: If the folder contains no CSV file.
        """
        files = glob(os.path.join(self._download_path, "*.csv"))
        if not files:
            raise ValueError(f"No CSV file found in {self._download_path}")
        return max(files, key=os.path.getctime)

    def pipe_check_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Strip bracketed suffixes from column names and validate them.

        Everything from the first `` [<digit>`` onwards is removed from each
        column name. The input frame is left untouched; a copy is returned.

        Raises:
            ValueError: If any key of ``columns_mapping`` is missing.
        """
        df = df.copy()
        df.columns = df.columns.str.replace(r" \[\d.*", "", regex=True)
        columns_missing = set(self.columns_mapping).difference(df.columns)
        if columns_missing:
            raise ValueError(f"Missing column fields: {columns_missing}. Present columns are: {df.columns}")
        return df

    def pipe_check_countries(self, df: pd.DataFrame) -> pd.DataFrame:
        """Placeholder step: no check is implemented, ``df`` is returned as is."""
        # Returning the frame (rather than None) keeps this step safe to use
        # inside a ``.pipe`` chain.
        return df

    def pipe_rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename raw columns according to ``columns_mapping``."""
        return df.rename(columns=self.columns_mapping)

    def pipe_filter_countries(self, df: pd.DataFrame) -> pd.DataFrame:
        """Get rows from selected countries.

        Raises:
            ValueError: If any key of ``PAHO_COUNTRIES`` is absent from the
                ``location`` column.
        """
        countries_wrong = set(PAHO_COUNTRIES).difference(df.location)
        if countries_wrong:
            raise ValueError(f"Invalid country(s) {countries_wrong}")
        selected = df[df.location.isin(PAHO_COUNTRIES)]
        # ``assign`` builds a new frame, avoiding a write into a filtered
        # slice (SettingWithCopyWarning).
        return selected.assign(location=selected.location.replace(PAHO_COUNTRIES))

    def pipe_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``people_vaccinated`` and ``people_fully_vaccinated`` columns."""
        return df.assign(
            people_vaccinated=df["single_dose"] + df["dose_1"],
            people_fully_vaccinated=df["single_dose"] + df["dose_2"],
        )

    def pipe_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add the ``source_url`` column."""
        return df.assign(
            source_url=self.source_url,
        )

    def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
        """Attach a ``vaccine`` column built from the WHO vaccination data.

        WHO vaccine names are mapped through ``WHO_VACCINES``, de-duplicated,
        sorted and joined with ", ". The merge on ``country_code``/``ISO3`` is
        an inner join, so rows without a WHO match are dropped.
        """
        url = "https://covid19.who.int/who-data/vaccination-data.csv"

        def _format_vaccines(raw: str) -> str:
            # Strip before comparing, so " Unknown Vaccine" (with the space
            # left by the comma split) is filtered out instead of reaching
            # the WHO_VACCINES lookup. Empty tokens are skipped as well.
            names = (name.strip() for name in raw.split(","))
            mapped = {WHO_VACCINES[name] for name in names if name and name != "Unknown Vaccine"}
            return ", ".join(sorted(mapped))

        df_who = pd.read_csv(url, usecols=["ISO3", "VACCINES_USED"]).rename(columns={"VACCINES_USED": "vaccine"})
        df_who = df_who.dropna(subset=["vaccine"])
        df_who = df_who.assign(vaccine=df_who.vaccine.apply(_format_vaccines))
        return df.merge(df_who, left_on="country_code", right_on="ISO3")

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run all cleaning steps on the raw frame, in order."""
        return (
            df.pipe(self.pipe_check_columns)
            .pipe(self.pipe_rename_columns)
            .pipe(self.pipe_filter_countries)
            .pipe(self.pipe_metrics)
            .pipe(self.pipe_metadata)
            .pipe(self.pipe_vaccine)
        )

    def increment_countries(self, df: pd.DataFrame):
        """Call ``increment`` once per row, ordered by ``location``.

        ``total_boosters`` is the sum of the two additional-dose columns.
        """
        for _, row in df.sort_values("location").iterrows():
            increment(
                location=row["location"],
                total_vaccinations=row["total_vaccinations"],
                people_vaccinated=row["people_vaccinated"],
                people_fully_vaccinated=row["people_fully_vaccinated"],
                total_boosters=row["total_boosters_1"] + row["total_boosters_2"],
                date=row["date"],
                vaccine=row["vaccine"],
                source_url=row["source_url"],
            )
            # logger.info(f"\tVAX - vax.incremental.paho.{row['location']}: SUCCESS ✅")

    def export(self):
        """Read, clean and export the data for every country."""
        df = self.read().pipe(self.pipeline)
        self.increment_countries(df)
def main():
    """Build a PAHO scraper and run its export step."""
    scraper = PAHO()
    scraper.export()


if __name__ == "__main__":
    main()