# Source code for cowidev.testing.incremental.fiji

import re

from bs4 import BeautifulSoup, element
import pandas as pd

from cowidev.utils.web import get_soup
from cowidev.utils.clean import clean_count, clean_date
from cowidev.testing.utils.incremental import increment


class Fiji:
    """Incremental scraper for Fiji's cumulative COVID-19 test count.

    The scraper walks the paginated listing at ``source_url``, looks for
    headlines matching ``regex["title"]``, opens the linked article and
    extracts the report date and the cumulative number of tests from the
    article text.
    """

    location = "Fiji"
    units = "tests performed"
    source_label = "Fiji Ministry of Health & Medical Services"
    notes = ""
    source_url = "https://www.health.gov.fj/page/"
    # Number of listing pages scanned before giving up.
    _num_max_pages = 3
    # Maximum number of matching headlines inspected on each listing page.
    _num_rows_per_page = 3
    # Matching headlines of the current listing page that are still unprocessed.
    __element = None
    regex = {
        "title": r"COVID-19 Update",
        "year": r"\d{4}",
        "date": r"tests have been reported for (\w+ \d+)",
        # Accept any number of thousands separators ("987", "523,456",
        # "1,234,567"); a single-comma pattern would silently truncate
        # seven-digit totals.
        "count": r"tests since 2020 are (\d+(?:,\d+)*)",
    }

    def read(self) -> pd.Series:
        """Read data from source.

        Returns:
            A ``pd.Series`` with the keys ``source_url``, ``date`` and
            ``count`` for the first article that carries a report date, or
            ``None`` if none of the scanned listing pages yields one.
        """
        for page in range(1, self._num_max_pages + 1):
            soup = get_soup(f"{self.source_url}{page}/")
            record, proceed = self._parse_data(soup)
            if not proceed:
                return pd.Series(record)
        return None

    def _parse_data(self, soup: "BeautifulSoup") -> tuple:
        """Get data from one listing page.

        Inspects at most ``_num_rows_per_page`` matching headlines, in page
        order, and stops at the first article from which a date can be
        extracted.

        Args:
            soup: Parsed listing page.

        Returns:
            ``(record, False)`` when a record was found, ``(None, True)`` when
            the caller should proceed to the next listing page.
        """
        # Build the headline list once per page; rebuilding it for every row
        # would make each attempt look at the very same first headline.
        self._get_list_of_elements(soup)
        for _ in range(self._num_rows_per_page):
            if not self.__element:
                break
            # Get next relevant element and its year
            elem, year = self._get_relevant_element_and_year()
            if year is None:
                # The date cannot be completed without a year; try the next one.
                continue
            # Extract url from element, then the article text from the url
            url = self._parse_link_from_element(elem)
            text = self._get_text_from_url(url)
            # Extract date from text; articles without it are skipped
            date = self._parse_date_from_text(year, text)
            if not date:
                continue
            # Extract metrics from text
            count = self._parse_metrics(text)
            record = {
                "source_url": url,
                "date": date,
                "count": count,
            }
            return record, False
        return None, True

    def _get_list_of_elements(self, soup: "BeautifulSoup") -> None:
        """Store the ``h2`` elements whose text matches ``regex["title"]``."""
        title_pattern = self.regex["title"]
        self.__element = [
            title for title in soup.find_all("h2") if re.search(title_pattern, title.text)
        ]

    def _get_relevant_element_and_year(self) -> tuple:
        """Pop the next pending headline and extract the year from its text.

        Returns:
            ``(element, year)`` where ``year`` is the first four-digit run in
            the headline text, or ``None`` if the headline has none.
        """
        elem = self.__element.pop(0)
        match = re.search(self.regex["year"], elem.text)
        year = match.group() if match else None
        return elem, year

    def _parse_link_from_element(self, elem: "element.Tag") -> str:
        """Get the article link from a headline element.

        Raises:
            TypeError: If the headline does not contain a link.
        """
        # NOTE(review): assumes the headline <h2> wraps an <a href=...> that
        # points to the article - confirm against the live page.
        anchor = elem.find("a")
        if anchor is None:
            raise TypeError("Website Structure Changed, please update the script")
        return anchor["href"]

    def _parse_date_from_text(self, year: str, text: str) -> str:
        """Get the report date from the article text.

        Args:
            year: Four-digit year taken from the headline.
            text: Article text as returned by ``_get_text_from_url``.

        Returns:
            The result of ``clean_date`` for "<month> <day> <year>", or
            ``None`` if the text does not match ``regex["date"]``.
        """
        match = re.search(self.regex["date"], text)
        if not match:
            return None
        month_day = match.group(1)
        return clean_date(f"{month_day} {year}", "%B %d %Y")

    def _get_text_from_url(self, url: str) -> str:
        """Download ``url`` and return its text, lower-cased, on a single line.

        Every run of whitespace (newlines, non-breaking spaces, repeated
        blanks) is collapsed into one space so that the single-space patterns
        in ``regex`` match regardless of how the page is laid out.
        """
        soup = get_soup(url)
        # str.split() without arguments also splits on "\xa0".
        return " ".join(soup.get_text().split()).lower()

    def _parse_metrics(self, text: str) -> int:
        """Get the cumulative test count from the article text.

        Returns:
            The result of ``clean_count`` applied to the matched number.

        Raises:
            TypeError: If the text does not match ``regex["count"]``.
        """
        match = re.search(self.regex["count"], text)
        if not match:
            raise TypeError("Website Structure Changed, please update the script")
        return clean_count(match.group(1))

    def export(self):
        """Read the latest record and hand it to ``increment``.

        Raises:
            TypeError: If no record could be read from the source.
        """
        data = self.read()
        if data is None:
            # Fail with an explicit message instead of subscripting None below.
            raise TypeError(
                "No COVID-19 update with test data found, website structure may have changed,"
                " please update the script"
            )
        increment(
            sheet_name=self.location,
            country=self.location,
            units=self.units,
            date=data["date"],
            source_url=data["source_url"],
            source_label=self.source_label,
            count=data["count"],
        )
def main():
    """Run the Fiji scraper and export its latest record."""
    Fiji().export()