Source code for cowidev.testing.incremental.syria

import re
import tempfile


from bs4 import BeautifulSoup, element
import pandas as pd
from pdfminer.high_level import extract_text

from cowidev.utils import get_soup
from cowidev.utils.web.download import download_file_from_url
from cowidev.utils.clean import extract_clean_date, clean_count
from cowidev.testing.utils.incremental import increment
from cowidev.testing import CountryTestBase


[docs]class Syria(CountryTestBase): location: str = "Syria" units: str = "tests performed" source_label: str = "WHO Syrian Arab Republic" source_url: str = ( "https://reliefweb.int/updates?advanced-search=%28C226%29_%28S1275%29_%28DT4642%29_%28T4595%29_%28F10%29" ) regex: dict = {"date": r"(\d{1,2} \w+ 20\d{2})", "count": r"Total Test (\d+) (\d+)"}
[docs] def read(self) -> pd.Series: """Read data from source.""" data = [] soup = get_soup(self.source_url) data = self._parse_data(soup) return pd.Series(data)
[docs] def _parse_data(self, soup: BeautifulSoup) -> tuple: """Parses data from soup""" # Get relevant element elem = self._get_relevant_element(soup) if not elem: return None, True # Extract url and date from element pdf_url, date = self._get_link_and_date_from_element(elem) # Extract text text = self._parse_pdf_url(pdf_url) # Get metrics from text count = self._parse_metrics(text) record = { "source_url": pdf_url, "date": date, "count": count, } return record, False
[docs] def _get_relevant_element(self, soup: BeautifulSoup) -> element.Tag: """Parses pdf url from soup""" elem = soup.find_all("h4", "title")[0].find("a") if not elem: raise ValueError("Element not found, please check the source") return elem
[docs] def _parse_date_from_element(self, elem: element.Tag) -> str: """Get date from relevant element.""" return extract_clean_date(elem.text, regex=self.regex["date"], date_format="%d %B %Y")
[docs] def _parse_pdf_url(self, pdf_url: str) -> str: """Parses pdf text""" with tempfile.NamedTemporaryFile() as tmp: download_file_from_url(pdf_url, tmp.name) with open(tmp.name, "rb") as f: text = extract_text(f) text = text.replace("\n", "") return text
[docs] def _parse_metrics(self, text: str) -> int: """Get metrics from report text.""" return clean_count( int(re.search(self.regex["count"], text).group(1) + re.search(self.regex["count"], text).group(2)) )
[docs] def export(self): """Export data to csv.""" data = self.read()[0] increment( sheet_name=self.location, country=self.location, units=self.units, date=data["date"], source_url=data["source_url"], source_label=self.source_label, count=data["count"], )
[docs]def main(): Syria().export()