"""Source code for cowidev.testing.incremental.nicaragua."""

import re

from bs4 import BeautifulSoup
import pandas as pd
import requests
import tempfile
from pdfminer.high_level import extract_text
from typing import Iterator
from epiweeks import Week


from cowidev.utils import get_soup, clean_count, clean_date
from cowidev.utils.utils import download_file_from_url
from cowidev.testing import CountryTestBase


class Nicaragua(CountryTestBase):
    """Scraper for COVID-19 testing data published by Nicaragua's Ministry of Health.

    The cumulative number of tests performed is extracted from the weekly
    epidemiological bulletin PDFs linked from the ministry's downloads page.
    """

    location: str = "Nicaragua"
    units: str = "tests performed"
    source_label: str = "Ministry of Health"
    # Base of the repository download endpoint; the document id found in the
    # article page is appended to form the direct PDF link.
    _base_url: str = "http://www.minsa.gob.ni/index.php/repository/func-download"
    source_url_ref: str = "http://www.minsa.gob.ni/index.php/repository/Descargas-MINSA/COVID-19/Boletines-Epidemiol%C3%B3gico/Boletines-2022/"
    regex: dict = {
        "title": r"Boletín Epidemiológico de la Semana No. ",
    }

    def read(self) -> pd.DataFrame:
        """Read data from source."""
        soup = get_soup(self.source_url_ref)
        df = self._parse_data(soup)
        return df

    def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
        """Build a one-row DataFrame with the cumulative test count and its date."""
        # Get the download URL of the latest bulletin PDF
        link = self._get_download_url(soup)
        # Parse cumulative count from the PDF
        count = self._extract_text_from_url(link)
        # Derive the date from the bulletin's week number
        date = self._parse_date(soup)
        df = pd.DataFrame(
            {
                "Date": [date],
                "Cumulative total": [count],
            }
        )
        return df

    def _get_download_url(self, soup: BeautifulSoup) -> str:
        """Locate the latest bulletin article and resolve the direct PDF download URL.

        Raises:
            ValueError: if the article anchor or the download id cannot be found.
        """
        anchor = soup.find("a", text=re.compile(self.regex["title"]))
        # Bug fix: the original indexed ["href"] before checking for a missing
        # anchor, so a missing article raised TypeError and the ValueError
        # branch was unreachable.
        if anchor is None:
            raise ValueError("Article not found, please update the script")
        response = requests.get(anchor["href"], allow_redirects=True)
        text = response.content.decode("utf-8")
        # The article page embeds the download id in a JS snippet ending in '}
        match = re.search("func-download(.*)'}", text)
        if match is None:
            raise ValueError("Download link not found, please update the script")
        return f"{self._base_url}{match.group(1)}"

    def _extract_text_from_url(self, link: str) -> int:
        """Download the bulletin PDF and extract the cumulative test count.

        Raises:
            ValueError: if the count cannot be located in the PDF text.
        """
        with tempfile.NamedTemporaryFile() as tmp:
            download_file_from_url(link, tmp.name)
            with open(tmp.name, "rb") as f:
                text = extract_text(f).replace("\n", " ")
        # Bug fix: guard the regex match before .group(1) — a layout change in
        # the PDF previously raised AttributeError instead of ValueError.
        match = re.search("• Acumulado: (.*) Recuperados", text)
        if match is None:
            raise ValueError("Count not found, please update the script")
        count = clean_count(match.group(1))
        if not count:
            raise ValueError("Count not found, please update the script")
        return count

    def _parse_date(self, soup: BeautifulSoup) -> str:
        """Derive the bulletin date from its week number (end of the ISO week)."""
        title = soup.find("a", text=re.compile(self.regex["title"])).text
        # Reuse the class-level title pattern instead of duplicating it here.
        week_num = int(re.search(f"{self.regex['title']}(.*)", title).group(1))
        # NOTE(review): the year is hard-coded; bulletins from other years will
        # be dated incorrectly — confirm before reusing beyond 2022.
        date = Week(2022, week_num, system="iso").enddate()
        return clean_date(date)

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Attach metadata, merge with the current datafile, and drop duplicate counts."""
        return (
            df.pipe(self.pipe_metadata)
            .pipe(self.pipe_merge_current)
            .drop_duplicates(subset=["Cumulative total"], keep="first")
        )

    def export(self):
        """Run the full scrape-transform-export pipeline."""
        df = self.read().pipe(self.pipeline)
        self.export_datafile(df)
def main():
    """Entry point: scrape Nicaragua testing data and export the datafile."""
    scraper = Nicaragua()
    scraper.export()