Source code for cowidev.vax.incremental.hungary

import re

from bs4 import BeautifulSoup
import pandas as pd

from cowidev.utils.clean import clean_count, clean_string, extract_clean_date
from cowidev.utils.web.scraping import get_soup
from cowidev.vax.utils.base import CountryVaxBase


[docs]class Hungary(CountryVaxBase): def __init__(self): self.source_url = "https://koronavirus.gov.hu" self.location = "Hungary" self._num_max_pages = 10 self.regex = { "title": r"\d+ [millió]+ \d+ [ezer]+ a beoltott, [\d\s]+ az új fertőzött", "metrics": ( r"A beoltottak száma ([\d\s]+) fő, közülük ([\d\s]+) fő a második, ([\d\s]+) fő (?:a|már) harmadik" r"(?:, ([\d\s]+) fő már ?a negyedik)? oltását is felvette" ), }
[docs] def read(self, last_update: str) -> pd.DataFrame: data = [] for cnt in range(0, self._num_max_pages): # print(f"page: {cnt}") url = f"{self.source_url}/hirek?page={cnt}/" soup = get_soup(url) data_, proceed = self.parse_data(soup, last_update) data.extend(data_) if not proceed: break return pd.DataFrame(data)
[docs] def parse_data(self, soup: BeautifulSoup, last_update: str) -> tuple: elems = self.get_elements(soup) records = [] for elem in elems: # print(elem) soup = get_soup(elem["link"]) record = { "source_url": elem["link"], **self.parse_data_news_page(soup), } # print("----") # print(record) if record["date"] > last_update: # print(record, "added") records.append(record) else: # print(record["date"], "END") return records, False return records, True
[docs] def get_elements(self, soup: BeautifulSoup) -> list: elems = soup.find_all("h3", text=re.compile(self.regex["title"])) elems = [{"link": self.parse_link(elem)} for elem in elems] return elems
[docs] def parse_data_news_page(self, soup: BeautifulSoup): """ 2021-09-10 We received confirmation from the International Communications Office, State Secretariat for International Communications and Relations, that the part of the report referring to people who received the 2nd dose ("közülük ([\d ]+) fő már a második oltását is megkapt") also included those who have received the J&J vaccine. On the other hand, we cannot estimate the number of vaccinations administered, as adding the two reported metrics would count J&J vaccines twice. """ text = clean_string(soup.find(class_="page_body").text) match = re.search(self.regex["metrics"], text) people_vaccinated = clean_count(match.group(1)) people_fully_vaccinated = clean_count(match.group(2)) total_boosters = clean_count(match.group(3)) + clean_count(match.group(4)) date = extract_clean_date( soup.find("p").text, regex="(202\d. .* \d+.) - .*", date_format="%Y. %B %d.", # loc="hu_HU.UTF-8", lang="hu", minus_days=1, ) return { "people_vaccinated": people_vaccinated, "people_fully_vaccinated": people_fully_vaccinated, "total_boosters": total_boosters, "date": date, }
[docs] def pipe_drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame: return df.sort_values("date").drop_duplicates(keep="first")
[docs] def pipe_location(self, df: pd.DataFrame) -> pd.DataFrame: return df.assign(location=self.location)
[docs] def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame: return df.assign( vaccine="Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sputnik V" )
[docs] def pipe_select_output_columns(self, df: pd.DataFrame) -> pd.DataFrame: return df[ [ "location", "date", "vaccine", "source_url", "people_vaccinated", "people_fully_vaccinated", "total_boosters", ] ]
[docs] def pipeline(self, df: pd.Series) -> pd.Series: return ( df.pipe(self.pipe_drop_duplicates) .pipe(self.pipe_location) .pipe(self.pipe_vaccine) .pipe(self.pipe_select_output_columns) .sort_values(by="date") )
[docs] def export(self): """Generalized.""" last_update = self.load_datafile().date.max() df = self.read(last_update) if not df.empty and "people_vaccinated" in df.columns: df = df.pipe(self.pipeline) df = df.pipe(self.pipe_drop_duplicates) self.export_datafile(df, attach=True)
[docs]def main(): Hungary().export()