# Source code for cowidev.vax.incremental.hungary

import re

from bs4 import BeautifulSoup
import pandas as pd

from cowidev.utils.clean import clean_count, clean_string, extract_clean_date
from cowidev.utils.web.scraping import get_soup
from cowidev.vax.utils.base import CountryVaxBase


class Hungary(CountryVaxBase):
    """Incremental scraper for Hungary's vaccination figures.

    Walks the paginated news listing under ``source_url``, follows every
    headline matching the daily-report title pattern and extracts the
    vaccination metrics from the text of each linked article.
    """

    def __init__(self):
        self.source_url = "https://koronavirus.gov.hu"
        self.location = "Hungary"
        # Upper bound on the number of news listing pages visited per run.
        self._num_max_pages = 10
        self.regex = {
            # Headline pattern used to pick report articles from the listing.
            "title": r"\d+ [millió]+ \d+ [ezer]+ a beoltott, [\d\s]+ az új fertőzött",
            # Capture groups: 1 = people vaccinated, 2 = second doses,
            # 3 = third doses, 4 = fourth doses (this clause is optional in
            # the pattern, so group 4 may be None on a successful match).
            "metrics": (
                r"A beoltottak száma ([\d\s]+) fő, közülük ([\d\s]+) fő a második, ([\d\s]+) fő (?:a|már) harmadik"
                r"(?:, ([\d\s]+) fő már ?a negyedik)? oltását is felvette"
            ),
        }

    def read(self, last_update: str) -> pd.DataFrame:
        """Collect report records newer than ``last_update``.

        Listing pages are visited in order until either ``_num_max_pages``
        pages were read or a record not newer than ``last_update`` is found.

        Args:
            last_update: Date of the latest record already stored; each parsed
                record's date is compared against it.

        Returns:
            DataFrame built from the collected records (empty if none).
        """
        data = []
        for cnt in range(self._num_max_pages):
            url = f"{self.source_url}/hirek?page={cnt}/"
            soup = get_soup(url)
            data_, proceed = self.parse_data(soup, last_update)
            data.extend(data_)
            if not proceed:
                break
        return pd.DataFrame(data)

    def parse_data(self, soup: BeautifulSoup, last_update: str) -> tuple:
        """Parse every report linked from one news listing page.

        Args:
            soup: Parsed HTML of a news listing page.
            last_update: Records whose date is not greater than this value
                stop the parsing.

        Returns:
            Tuple ``(records, proceed)``. ``records`` is the list of new
            records found on this page; ``proceed`` is False as soon as a
            record not newer than ``last_update`` is met, True otherwise.
        """
        records = []
        for elem in self.get_elements(soup):
            link = elem["link"]
            # Separate name so the listing page passed by the caller is not rebound.
            news_soup = get_soup(link)
            record = {
                "source_url": link,
                **self.parse_data_news_page(news_soup),
            }
            if not (record["date"] > last_update):
                # Reached data that is already stored: stop here.
                return records, False
            records.append(record)
        return records, True

    def get_elements(self, soup: BeautifulSoup) -> list:
        """Return ``{"link": url}`` dicts for each ``h3`` headline matching the title pattern."""
        elems = soup.find_all("h3", text=re.compile(self.regex["title"]))
        return [{"link": self.parse_link(elem)} for elem in elems]

    def parse_data_news_page(self, soup: BeautifulSoup):
        r"""Extract the vaccination metrics and the date from a report article.

        2021-09-10
        We received confirmation from the International Communications Office, State Secretariat
        for International Communications and Relations, that the part of the report referring to
        people who received the 2nd dose ("közülük ([\d ]+) fő már a második oltását is megkapt")
        also included those who have received the J&J vaccine.
        On the other hand, we cannot estimate the number of vaccinations administered, as adding
        the two reported metrics would count J&J vaccines twice.

        Args:
            soup: Parsed HTML of a single report article.

        Returns:
            Dict with keys ``people_vaccinated``, ``people_fully_vaccinated``,
            ``total_boosters`` and ``date``.

        Raises:
            ValueError: If the metrics sentence is not found in the article text.
        """
        text = clean_string(soup.find(class_="page_body").text)
        match = re.search(self.regex["metrics"], text)
        if match is None:
            raise ValueError(
                "Vaccination metrics sentence not found in the news page; the report wording may have changed."
            )

        people_vaccinated = clean_count(match.group(1))
        people_fully_vaccinated = clean_count(match.group(2))
        # Boosters = third doses plus fourth doses. The fourth-dose clause is
        # optional in the pattern, so group 4 is None when it is absent.
        total_boosters = clean_count(match.group(3))
        fourth_doses = match.group(4)
        if fourth_doses is not None:
            total_boosters += clean_count(fourth_doses)

        # NOTE(review): minus_days=1 presumably shifts the parsed publication
        # date back one day -- confirm against extract_clean_date.
        date = extract_clean_date(
            soup.find("p").text,
            regex=r"(202\d. .* \d+.) - .*",
            date_format="%Y. %B %d.",
            lang="hu",
            minus_days=1,
        )

        return {
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": date,
        }

    def parse_link(self, elem):
        """Build the article URL from the ``href`` of the headline's parent tag."""
        href = elem.parent["href"]
        return f"{self.source_url}/{href}"

    def pipe_drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
        """Sort by date and drop fully duplicated rows, keeping the first occurrence."""
        return df.sort_values("date").drop_duplicates(keep="first")

    def pipe_location(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add the ``location`` column."""
        return df.assign(location=self.location)

    def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add the ``vaccine`` column with the fixed list of vaccine names."""
        return df.assign(
            vaccine="Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sputnik V"
        )

    def pipe_select_output_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Keep only the output columns, in their final order."""
        return df[
            [
                "location",
                "date",
                "vaccine",
                "source_url",
                "people_vaccinated",
                "people_fully_vaccinated",
                "total_boosters",
            ]
        ]

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply all processing steps and return the frame sorted by date."""
        return (
            df.pipe(self.pipe_drop_duplicates)
            .pipe(self.pipe_location)
            .pipe(self.pipe_vaccine)
            .pipe(self.pipe_select_output_columns)
            .sort_values(by="date")
        )

    def export(self):
        """Read new records, process them and attach them to the stored data file.

        Nothing is exported when no records were read or when the frame lacks
        a ``people_vaccinated`` column.
        """
        last_update = self.load_datafile().date.max()
        df = self.read(last_update)
        if not df.empty and "people_vaccinated" in df.columns:
            df = df.pipe(self.pipeline)
            df = df.pipe(self.pipe_drop_duplicates)
            self.export_datafile(df, attach=True)


def main():
    """Entry point: build the Hungary scraper and run its export."""
    scraper = Hungary()
    scraper.export()