Source code for cowidev.testing.incremental.haiti
import re
import tempfile
from bs4 import BeautifulSoup
import pandas as pd
from pdfminer.high_level import extract_text
from cowidev.utils import get_soup, clean_count
from cowidev.utils.clean import extract_clean_date
from cowidev.utils.utils import download_file_from_url
from cowidev.testing.utils.base import CountryTestBase
[docs]class Haiti(CountryTestBase):
location: str = "Haiti"
units: str = "tests performed"
source_url: dict = "https://www.mspp.gouv.ht/documentation/"
source_url_ref: str = None
source_label: str = "Ministry of Public Health and Population"
regex: dict = {
"title": r"surveillance du nouveau Coronavirus \(COVID-19\)",
"date": r"(\d{1,2}\-\d{1,2}\-20\d{2})",
"metrics": r"INDICATEURS ([\d,]+)",
}
[docs] def read(self) -> pd.DataFrame:
"""Reads data from source."""
soup = get_soup(self.source_url)
df = self._parse_data(soup)
return df
[docs] def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
"""Parses data from soup."""
# Obtain pdf url
self.source_url_ref = soup.find(
text=re.compile("surveillance du nouveau Coronavirus \(COVID-19\)")
).parent.findNext("a")["href"]
# Extract text from pdf url
text = self._extract_text_from_url()
# Clean data
df = self._parse_metrics(text)
return df
[docs] def _extract_text_from_url(self) -> str:
"""Extracts text from pdf."""
with tempfile.NamedTemporaryFile() as tmp:
download_file_from_url(self.source_url_ref, tmp.name)
with open(tmp.name, "rb") as f:
text = extract_text(f).replace("\n", " ")
text = re.sub(r"\s+", " ", text)
return text
[docs] def _parse_metrics(self, text: str) -> pd.DataFrame:
"""Parses metrics from data."""
# Extract data
match_count = re.search(self.regex["metrics"], text)
if not match_count:
raise ValueError("Unable to extract data from text, please update the regex.")
count = clean_count(match_count.group(1))
# Create dataframe
df = {
"Cumulative total": [count],
}
return pd.DataFrame(df)
[docs] def _parse_date(self, link: str) -> str:
"""Gets date from link."""
return extract_clean_date(link, self.regex["date"], "%d-%m-%Y")
[docs] def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
"""Pipes date."""
return df.assign(Date=self._parse_date(self.source_url_ref))
[docs] def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
"""Pipeline for data."""
return df.pipe(self.pipe_date).pipe(self.pipe_metadata)
[docs] def export(self):
"""Exports data to CSV."""
df = self.read().pipe(self.pipeline)
# Export to CSV
self.export_datafile(df, attach=True)