Source code for cowidev.testing.incremental.moldova

import re

from bs4 import BeautifulSoup, element
import pandas as pd

from cowidev.utils.web import get_soup
from cowidev.utils.clean import clean_count, clean_date
from cowidev.testing.utils.incremental import increment


class Moldova:
    """Incremental scraper for COVID-19 test counts reported for Moldova.

    The scraper loads the news feed at ``source_url``, picks a press-release
    link whose URL contains ``"cazuri"``, downloads that page, and extracts
    the report date and the number of tests with the patterns in ``regex``.
    The result is handed to ``increment`` as a daily change.
    """

    location = "Moldova"
    units = "tests performed"
    source_label = "Ministry of Health of the Republic of Moldova"
    notes = ""
    source_url = "https://msmps.gov.md/media/comunicate/"
    # "date" and "count" are applied to the press-release text; "title" is not
    # referenced by any method of this class.
    regex = {
        "title": r"(cazuri noi de COVID-19)|(cazuri de COVID-19)|(cazuri de COVID-19,)",
        "date": r"(\d+\/\d+\/\d+)",
        "count": r"((\d+)) teste.|(\d+) de teste",
    }
    # Initial value for cumulative total: 364317

    def read(self) -> "pd.Series":
        """Read data from source.

        Returns:
            A series with the keys ``source_url``, ``date`` and
            ``daily_change``.
        """
        soup = get_soup(self.source_url)
        data = self._parse_data(soup)
        return pd.Series(data)

    def _parse_data(self, soup: "BeautifulSoup") -> dict:
        """Get data from the source page.

        Args:
            soup: Parsed news-feed page.

        Returns:
            A dict with the press-release URL (``source_url``), the report
            date (``date``) and the number of tests (``daily_change``).
        """
        # Locate the press release in the feed
        url = self._get_relevant_element(soup)
        # Download it and normalise its text
        text = self._get_text_from_url(url)
        # Extract date and test count from the text
        date = self._parse_date_from_text(text)
        daily_change = self._parse_metrics(text)
        return {
            "source_url": url,
            "date": date,
            "daily_change": daily_change,
        }

    def _get_relevant_element(self, soup: "BeautifulSoup") -> str:
        """Get the link of the relevant press release in the news feed.

        Args:
            soup: Parsed news-feed page.

        Returns:
            The ``href`` of the selected news item.

        Raises:
            ValueError: If no news item link contains ``"cazuri"``.
        """
        link = None
        for a in soup.find_all("a", class_="list__news_item-link font__bigger"):
            href = a.get("href")
            # Anchors without an href cannot match; skip them instead of
            # failing on the substring test.
            if href and "cazuri" in href:
                # NOTE(review): there is no early exit, so the LAST matching
                # anchor on the page wins. Confirm whether the first match
                # was intended.
                link = href
        if not link:
            raise ValueError("No data found, Please check the source.")
        return link

    def _get_text_from_url(self, url: str) -> str:
        """Extract the text of the page at ``url`` and normalise numbers.

        Hyphens and commas are dropped, and a single whitespace character
        between two digits is removed so that grouped figures such as
        ``"12 345"`` become ``"12345"``.
        """
        text = get_soup(url).get_text()
        text = text.replace("-", "").replace(",", "")
        text = re.sub(r"(\d)\s(\d)", r"\1\2", text)
        return text

    def _parse_date_from_text(self, text: str) -> str:
        """Get date from text.

        Args:
            text: Normalised press-release text.

        Returns:
            The day before the first ``d/m/Y`` date found in ``text``, as a
            string.

        Raises:
            ValueError: If no date is found.
        """
        match = re.search(self.regex["date"], text)
        if not match:
            raise ValueError("No date found, Please check the source.")
        # The date is shifted back by one day; presumably the release is
        # published the day after the figures it reports (TODO confirm).
        date = clean_date(match.group(1), "%d/%m/%Y", as_datetime=True) - pd.Timedelta(days=1)
        return str(date.date())

    def _parse_metrics(self, text: str) -> int:
        """Get the number of tests from the news text.

        Args:
            text: Normalised press-release text.

        Returns:
            The test count, as produced by ``clean_count``.

        Raises:
            ValueError: If the count pattern does not match.
        """
        match = re.search(self.regex["count"], text)
        if not match:
            raise ValueError("No test count found, Please check the source.")
        # The whole match (number plus the trailing word) is passed on;
        # clean_count is assumed to strip the non-digit part (verify).
        return clean_count(match.group(0))

    def export(self):
        """Export data to CSV."""
        data = self.read()
        increment(
            sheet_name=self.location,
            country=self.location,
            units=self.units,
            date=data["date"],
            source_url=data["source_url"],
            source_label=self.source_label,
            daily_change=data["daily_change"],
            count=pd.NA,
        )
def main():
    """Run the Moldova scraper and export its latest reading."""
    scraper = Moldova()
    scraper.export()