Source code for cowidev.testing.incremental.cape_verde

import re

from bs4 import BeautifulSoup, element
import pandas as pd

from cowidev.utils.web import get_soup
from cowidev.utils.clean import clean_count, extract_clean_date
from cowidev.testing.utils.incremental import increment


class CapeVerde:
    """Incremental scraper for Cape Verde COVID-19 testing data.

    The listing page at ``source_url`` is fetched, the first post title on it
    is followed to the bulletin page, and the test count and date are pulled
    out of that page's text with the patterns in ``regex``.
    """

    location = "Cape Verde"
    units = "tests performed"
    source_label = "Government of Cape Verde"
    source_url = "https://covid19.cv/category/boletim-epidemiologico/"
    # "date" is matched against the lower-cased page text; "count" is matched
    # against the page text as returned by ``_get_text_from_url`` (commas
    # already removed, case untouched).
    regex = {
        "date": r"(\d+) (?:de )?(\w+) de (20\d+)",
        "count": r"(?:total|totais) (?:de|dos|das) (\d+) (?:resultados|amostras)",
    }

    def read(self) -> "pd.Series":
        """Read data from source.

        Returns:
            A Series with the keys ``source_url``, ``date`` and
            ``daily_change``.
        """
        soup = get_soup(self.source_url)
        data = self._parse_data(soup)
        return pd.Series(data)

    def _parse_data(self, soup: "BeautifulSoup") -> dict:
        """Get data from the source page.

        Args:
            soup: Parsed listing page (``source_url``).

        Returns:
            A dict with ``source_url`` (the bulletin link), ``date`` and
            ``daily_change``.
        """
        # Get relevant element
        elem = self._get_relevant_element(soup)
        # Extract url from element
        url = self._parse_link_from_element(elem)
        # Extract text from url
        text = self._get_text_from_url(url)
        # Extract metrics from text
        daily_change = self._parse_metrics(text)
        # Extract date from text
        date = self._parse_date(text)
        return {
            "source_url": url,
            "date": date,
            "daily_change": daily_change,
        }

    def _get_relevant_element(self, soup: "BeautifulSoup") -> "element.Tag":
        """Get the relevant element from the source page.

        Returns:
            The first ``<h3 class="elementor-post__title">`` on the page.

        Raises:
            TypeError: If no such heading exists.
        """
        elem = soup.find("h3", class_="elementor-post__title")
        if elem is None:
            raise TypeError("Website Structure Changed, please update the script")
        return elem

    def _parse_link_from_element(self, elem: "element.Tag") -> str:
        """Get the bulletin link from the relevant element.

        ``_parse_data`` relies on this helper, which was previously missing
        from the class.

        Args:
            elem: Heading returned by ``_get_relevant_element``.

        Returns:
            The ``href`` of the first ``<a>`` inside ``elem``.

        Raises:
            TypeError: If ``elem`` has no ``<a>`` or the anchor has no
                ``href``.
        """
        # NOTE(review): assumes the post title heading wraps an anchor that
        # points at the bulletin page -- confirm against the live markup.
        anchor = elem.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            raise TypeError("Website Structure Changed, please update the script")
        return href

    def _parse_date(self, text: str) -> str:
        """Get the date from the bulletin text.

        The text is lower-cased and passed to ``extract_clean_date`` with the
        ``date`` pattern, the ``"%d %B %Y"`` format and ``lang="pt"``.
        """
        return extract_clean_date(text.lower(), self.regex["date"], "%d %B %Y", lang="pt")

    def _get_text_from_url(self, url: str) -> str:
        """Extract text from the url.

        Returns:
            The stripped text of the element with class ``page-content``,
            with every comma removed.

        Raises:
            TypeError: If the page has no ``page-content`` element.
        """
        soup = get_soup(url)
        content = soup.find(class_="page-content")
        if content is None:
            # Fail with the same explicit message as the other structure
            # checks instead of an AttributeError on None.
            raise TypeError("Website Structure Changed, please update the script")
        return content.get_text(strip=True).replace(",", "")

    def _parse_metrics(self, text: str) -> int:
        """Get metrics from the text.

        Returns:
            The first ``count`` match, passed through ``clean_count``.

        Raises:
            ValueError: If the ``count`` pattern does not match.
        """
        match = re.search(self.regex["count"], text)
        if not match:
            raise ValueError("Website Structure Changed, please update the script")
        return clean_count(match.group(1))

    def export(self):
        """Export data to CSV.

        Reads the latest record and hands it to ``increment``.
        """
        data = self.read()
        increment(
            sheet_name=self.location,
            country=self.location,
            units=self.units,
            date=data["date"],
            source_url=data["source_url"],
            source_label=self.source_label,
            daily_change=data["daily_change"],
        )
def main():
    """Build a ``CapeVerde`` scraper and run its export."""
    scraper = CapeVerde()
    scraper.export()