# Source code for cowidev.testing.incremental.emro

import requests

from bs4 import BeautifulSoup
import pandas as pd
import tabula

from cowidev.utils import get_soup
from cowidev.utils.log import get_logger
from cowidev.utils.clean import extract_clean_date
from cowidev.testing.utils.orgs import EMRO_COUNTRIES
from cowidev.testing.utils.base import CountryTestBase

logger = get_logger()


class EMRO(CountryTestBase):
    """Incremental testing-data scraper for the WHO EMRO situation reports.

    Workflow (see `export`): fetch the situation-reports page, follow the first
    download link to a PDF, extract tables from the PDF with tabula, read the
    report date and the per-country "Total Tests" column, and export one data
    file per country.
    """

    location: str = "EMRO"  # Arbitrary location to pass checks
    units: str = "tests performed"
    source_url: str = "http://www.emro.who.int/health-topics/corona-virus/situation-reports.html"
    # Prefix prepended to the href found on the page to build the PDF URL
    _base_url: str = "http://www.emro.who.int"
    source_label: str = "WHO Regional Office for the Eastern Mediterranean"
    # Report date; filled in by `_parse_data`
    date: str = None
    regex: dict = {
        "date": r"(\d{1,2} \w+ 20\d{2})",
    }
    # Columns kept from the extracted metrics table
    columns_use: list = [
        "Country",
        "Total Tests",
    ]
    rename_columns: dict = {
        "Country": "location",
        "Total Tests": "Cumulative total",
    }
    # Column names used to recognise which extracted table holds which piece of data
    columns_to_check: dict = {
        "tests": "Total Tests",
        "date": "Table 1: Epidemiological situation in the Eastern Mediterranean Region",
    }

    @property
    def area(self) -> list:
        """
        Areas of pdf to be extracted

        Returns:
            list: [[y1, x1, y2, x2], ...]

        For more info see: https://github.com/tabulapdf/tabula-java/wiki/Using-the-command-line-tabula-extractor-tool
        """
        # NOTE(review): presumably the first area is the table title/date header and
        # the second is the table body - confirm against a current report.
        return [[56, 36, 98, 511], [119, 41, 488, 551]]

    def read(self) -> pd.DataFrame:
        """Reads data from source.

        Returns:
            pd.DataFrame: Table restricted to `columns_use`, as produced by `_parse_data`.
        """
        soup = get_soup(self.source_url)
        return self._parse_data(soup)

    def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
        """Parses data from soup.

        Side effects: sets `self.source_url_ref` (PDF URL) and `self.date`.
        """
        # Obtain pdf url
        self.source_url_ref = self._parse_pdf_url(soup)
        # Extract data tables
        df_list = self._parse_pdf_table()
        # Parse date
        self.date = self._parse_date(df_list)
        # Parse metrics
        return self._parse_metrics(df_list)

    def _parse_pdf_url(self, soup: BeautifulSoup) -> str:
        """Parses pdf url from soup.

        Looks for the first anchor inside the first element with class "download".

        Raises:
            ValueError: If the container, the anchor or its href is missing.
        """
        container = soup.find(class_="download")
        # Check the container first: calling .find on None would raise AttributeError
        # and bypass the intended ValueError below.
        elem = container.find("a") if container is not None else None
        if not elem:
            raise ValueError("Element not found, please update the script")
        href = elem.get("href")
        if not href:
            # Without this guard the URL would silently end in "None".
            raise ValueError("Element not found, please update the script")
        return f"{self._base_url}{href}"

    def _parse_pdf_table(self) -> list:
        """Parses pdf tables.

        Downloads the PDF at `self.source_url_ref` and extracts the regions given by
        `self.area` from all pages.

        Returns:
            list: Tables extracted by tabula.

        Raises:
            requests.HTTPError: If the PDF request returns an error status.
        """
        # Context manager releases the streamed connection once tabula has consumed it.
        with requests.get(self.source_url_ref, stream=True, verify=True) as response:
            # Fail early with a clear HTTP error instead of feeding an error page to tabula.
            response.raise_for_status()
            df_list = tabula.read_pdf(response.raw, pages="all", area=self.area)
        return df_list

    def _find_table(self, df_list: list, column: str) -> pd.DataFrame:
        """Returns the first table in `df_list` that has a column named `column`.

        Raises:
            IndexError: If no table has that column.
        """
        for table in df_list:
            if column in table.columns:
                return table
        raise IndexError(f"No table with column '{column}' found, please update the script")

    def _parse_date(self, df_list: list) -> str:
        """Parses date from DataFrame list.

        The date text is read from the first cell of the table whose header is
        `columns_to_check["date"]`, lower-cased, and handed to `extract_clean_date`.

        Raises:
            IndexError: If no table has the expected header.
        """
        df_date = self._find_table(df_list, self.columns_to_check["date"])
        date_str = df_date.iat[0, 0]
        return extract_clean_date(date_str.lower(), regex=self.regex["date"], date_format="%d %B %Y")

    def _parse_metrics(self, df_list: list) -> pd.DataFrame:
        """Parses metrics from DataFrame list.

        Returns:
            pd.DataFrame: Columns `columns_use`, with the tests column converted to numeric.

        Raises:
            IndexError: If no table has the tests column.
        """
        tests_col = self.columns_to_check["tests"]
        table = self._find_table(df_list, tests_col)
        # Work on an explicit copy so the assignment below never targets a view.
        df = table.loc[:, self.columns_use].copy()
        values = df[tests_col]
        if not pd.api.types.is_numeric_dtype(values):
            # Figures use spaces as thousands separators; strip them before converting.
            values = values.str.replace(" ", "", regex=False)
        # Plain column assignment replaces the column (and its dtype); `df.loc[:, col] = ...`
        # writes in place on recent pandas and would leave the column as object dtype.
        df[tests_col] = pd.to_numeric(values)
        return df

    def pipe_rename_countries(self, df: pd.DataFrame) -> pd.DataFrame:
        """Renames countries using the EMRO_COUNTRIES mapping (OWID naming convention)."""
        # assign() returns a new frame instead of mutating the caller's one.
        return df.assign(location=df["location"].replace(EMRO_COUNTRIES))

    def pipe_filter_entries(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Gets valid entries:

        - Keeps only rows whose location is one of the EMRO_COUNTRIES values
          (per the original note: countries not coming from OWID, to avoid a loop).
        """
        valid_locations = list(EMRO_COUNTRIES.values())
        return df[df["location"].isin(valid_locations)]

    def pipe_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
        """Adds metadata to DataFrame.

        Only columns not already present in `df` are added.
        """
        mapping = {
            "Country": df["location"],
            "Units": self.units,
            "Notes": self.notes,
            "Source URL": self.source_url_ref,
            "Source label": self.source_label,
            "Date": self.date,
        }
        mapping = {k: v for k, v in mapping.items() if k not in df}
        self._check_attributes(mapping)
        return df.assign(**mapping)

    def increment_countries(self, df: pd.DataFrame):
        """Exports data to the relevant csv and logs the confirmation.

        One export per distinct location; rows without a "Cumulative total" are dropped,
        and locations left with no rows are skipped.
        """
        # unique() keeps order of appearance, so export/log order is stable between runs
        # (iterating a set of strings is not).
        for location in df["location"].dropna().unique():
            df_c = df[df["location"] == location].dropna(
                subset=["Cumulative total"],
                how="all",
            )
            if df_c.empty:
                continue
            self.export_datafile(df_c, filename=location, attach=True)
            logger.info(f"\tcowidev.testing.incremental.emro.{location}: SUCCESS ✅")

    def pipeline(self, df: pd.DataFrame):
        """Pipeline for data: rename columns, rename countries, filter entries, add metadata."""
        return (
            df.pipe(self.pipe_rename_columns)
            .pipe(self.pipe_rename_countries)
            .pipe(self.pipe_filter_entries)
            .pipe(self.pipe_metadata)
        )

    def export(self):
        """Exports data to csv: read the source, run the pipeline, write per-country files."""
        df = self.read().pipe(self.pipeline)
        self.increment_countries(df)


def main():
    """Entry point: build an EMRO scraper and run its export."""
    EMRO().export()