# Source code for cowidev.vax.incremental.myanmar

import re

from bs4 import BeautifulSoup, element
import pandas as pd

from cowidev.utils.web import get_soup
from cowidev.utils.clean import clean_count, clean_date
from cowidev.vax.utils.incremental import increment, enrich_data


class Myanmar:
    """Incremental vaccination-data scraper for Myanmar.

    Walks the paginated news listing under ``_base_url``, picks the first
    entry whose text matches ``regex["title"]``, follows its link and pulls
    the cumulative vaccination figures out of the linked page's text.
    """

    # Value written to the "location" field of the exported record.
    location = "Myanmar"
    # Site root; also used to build absolute URLs from the hrefs found.
    _base_url = "https://mohs.gov.mm"
    # Listing path; the page number is appended directly to it.
    _url_subdirectory = "/main/content/new/list?pagesize=9&pagenumber="
    # Number of listing pages inspected before giving up.
    _num_max_pages = 3
    # Patterns used to locate the news entry and to extract each field.
    regex = {
        "title": r"ကိုဗစ်-19 ရောဂါ ကာကွယ်ဆေး ထိုးနှံပြီးစီးမှု",
        "date": r"(\d{1,2}\-\d{1,2}\-20\d{2})",
        "people_vaccinated": r"(\d+) \(Cumulative vaccinated people\)",
        "people_fully_vaccinated": r"(\d+) \(Cumulative fully vaccinated people\)",
        "total_vaccinations": r"(\d+) \(Cumulative vaccinated doses\)",
    }

    def read(self) -> pd.Series:
        """Scan the listing pages and return the first matching report.

        Returns:
            A series holding ``source_url``, ``date`` and the three metrics,
            or an empty series when none of the inspected pages contains a
            matching entry.
        """
        record = None

        for page_number in range(1, self._num_max_pages + 1):
            url = f"{self._base_url}{self._url_subdirectory}{page_number}"
            # NOTE(review): verify=False presumably disables TLS certificate
            # verification inside get_soup -- confirm it is still required.
            soup = get_soup(url, verify=False)
            record, proceed = self._parse_data(soup)
            if not proceed:
                break

        if record is None:
            # Explicit dtype keeps the "nothing found" result well-defined.
            return pd.Series(dtype=object)
        return pd.Series(record)

    def _parse_data(self, soup: "BeautifulSoup") -> tuple:
        """Build a record from one listing page.

        Returns:
            ``(record, proceed)``. ``proceed`` is ``True`` (and ``record`` is
            ``None``) when the page has no matching entry, so the caller
            should try the next page; otherwise ``record`` is a dict and
            ``proceed`` is ``False``.
        """
        elem = self._get_relevant_element(soup)
        if not elem:
            return None, True

        url, date = self._get_link_and_date_from_element(elem)
        text = self._get_text_from_url(url)

        record = {
            "source_url": url,
            "date": date,
            **self._parse_metrics(text),
        }
        return record, False

    def _get_relevant_element(self, soup: "BeautifulSoup") -> "element.NavigableString":
        """Return the first string in ``soup`` matching the title pattern.

        The result is whatever ``soup.find`` returns, i.e. ``None`` when the
        page contains no matching text.
        """
        return soup.find(text=re.compile(self.regex["title"]))

    def _get_text_from_url(self, url: str) -> str:
        """Fetch ``url`` and return its text, normalised for regex matching.

        Commas are removed (so thousands-separated numbers become plain digit
        runs) and every whitespace run is collapsed to a single space.
        """
        soup = get_soup(url, verify=False)
        text = soup.text.replace(",", "")
        return re.sub(r"\s+", " ", text)

    def _get_link_and_date_from_element(self, elem: "element.NavigableString") -> tuple:
        """Return ``(link, date)`` extracted from the matched element."""
        link = self._parse_link_from_element(elem)
        date = self._parse_date_from_element(elem)
        return link, date

    def _search_group(self, key: str, text: str) -> str:
        """Return group 1 of ``regex[key]`` searched in ``text``.

        Raises:
            ValueError: If the pattern does not match, instead of the opaque
                ``AttributeError`` that ``None.group`` would produce.
        """
        match = re.search(self.regex[key], text)
        if match is None:
            raise ValueError(f"{self.location}: could not find '{key}' in the source text.")
        return match.group(1)

    def _parse_date_from_element(self, elem: "element.NavigableString") -> str:
        """Extract the ``dd-mm-yyyy`` date from the element text.

        The value is passed through ``clean_date`` with format ``%d-%m-%Y``.

        Raises:
            ValueError: If the element text contains no date.
        """
        date = self._search_group("date", elem)
        return clean_date(date, "%d-%m-%Y")

    def _parse_link_from_element(self, elem: "element.NavigableString") -> str:
        """Return the absolute URL of the ``<a>`` ancestor of ``elem``.

        Raises:
            ValueError: If ``elem`` has no enclosing ``<a>`` tag.
        """
        anchor = elem.findParent("a")
        if anchor is None:
            raise ValueError(f"{self.location}: matched title is not inside a link.")
        return f"{self._base_url}{anchor['href']}"

    def _parse_metrics(self, text: str) -> dict:
        """Extract the three cumulative metrics from the report text.

        Raises:
            ValueError: If any of the metrics cannot be found in ``text``.
        """
        metric_names = ("people_vaccinated", "people_fully_vaccinated", "total_vaccinations")
        # Locate every metric first so a missing one fails before any cleaning.
        raw_values = {name: self._search_group(name, text) for name in metric_names}
        return {name: clean_count(value) for name, value in raw_values.items()}

    def pipe_location(self, data_series: pd.Series) -> pd.Series:
        """Add the ``location`` field via ``enrich_data``."""
        return enrich_data(data_series, "location", self.location)

    def pipe_vaccine(self, data_series: pd.Series) -> pd.Series:
        """Add the ``vaccine`` field via ``enrich_data``."""
        return enrich_data(data_series, "vaccine", "Oxford/AstraZeneca, Sinopharm/Beijing")

    def pipeline(self, data_series: pd.Series) -> pd.Series:
        """Apply ``pipe_location`` and then ``pipe_vaccine``."""
        return data_series.pipe(self.pipe_location).pipe(self.pipe_vaccine)

    def export(self):
        """Read, enrich and hand the record over to ``increment``.

        Raises:
            ValueError: If no matching report was found on the listing pages.
        """
        data = self.read()
        if data.empty:
            # Fail with a clear message rather than a KeyError further down.
            raise ValueError(
                f"{self.location}: no vaccination report found in the first "
                f"{self._num_max_pages} listing page(s)."
            )
        data = data.pipe(self.pipeline)
        increment(
            location=data["location"],
            total_vaccinations=data["total_vaccinations"],
            people_vaccinated=data["people_vaccinated"],
            people_fully_vaccinated=data["people_fully_vaccinated"],
            date=data["date"],
            source_url=data["source_url"],
            vaccine=data["vaccine"],
        )


def main():
    """Run the Myanmar scraper end to end by calling ``Myanmar().export()``."""
    Myanmar().export()