Source code for cowidev.vax.incremental.azerbaijan

import tempfile
import re

import pandas as pd
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

from cowidev.utils import clean_date, clean_count, get_soup
from cowidev.utils.web.download import download_file_from_url
from cowidev.vax.utils.base import CountryVaxBase
from cowidev.vax.utils.incremental import enrich_data


class Azerbaijan(CountryVaxBase):
    """Incremental vaccination data scraper for Azerbaijan.

    The source page is fetched, the PDF report it links to is downloaded and
    converted to text, and the report date plus five dose counters are
    extracted from that text with the patterns in ``regex``.
    """

    location = "Azerbaijan"
    source_url = "https://koronavirusinfo.az"
    # "date" and "doses" are matched against the normalised PDF text.
    # NOTE(review): "title" is not referenced by any method visible in this
    # class; presumably it is used by `_parse_pdf_link` -- confirm.
    regex = {
        "title": r"Vaksinasiya",
        "date": r"(\d{2}\.\d{2}\.20\d{2})",
        "doses": r"\"Buster\" doza vaksinlərin sayı (\d+) (\d+) (\d+) (\d+) (\d+) Gün",
    }

    def read(self) -> pd.Series:
        """Read data from source.

        Returns:
            A series holding the record built by `_parse_data`.
        """
        # NOTE(review): certificate verification is switched off for this
        # request; confirm this is still required for the source site.
        soup = get_soup(self.source_url, verify=False)
        data = self._parse_data(soup)
        return pd.Series(data)

    def _parse_data(self, soup: BeautifulSoup) -> dict:
        """Get data from the source page.

        Args:
            soup: Parsed HTML of the source page.

        Returns:
            A dict with the five parsed counters, ``source_url`` and ``date``.

        Raises:
            ValueError: If the resolved report link does not end in ``.pdf``,
                or if the date / counters cannot be found in the PDF text.
        """
        # Get pdf url
        # NOTE(review): `_parse_pdf_link` is not defined in this class as
        # shown; presumably it is provided elsewhere -- confirm.
        url = self._parse_pdf_link(soup)
        if not url.endswith(".pdf"):
            raise ValueError(f"File reporting metrics is not a PDF: {url}!")
        # Extract pdf text
        text = self._parse_pdf_text(url)
        # Extract date from text
        date = self._parse_date(text)
        # Extract metrics from text
        (
            total_vaccinations,
            people_vaccinated,
            people_fully_vaccinated,
            total_boosters,
            doses_after_positive,
        ) = self._parse_metrics(text)
        record = {
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "doses_after_positive": doses_after_positive,
            "source_url": self.source_url,
            "date": date,
        }
        return record

    def _parse_pdf_text(self, url: str) -> str:
        """Parse pdf text from url.

        Args:
            url: Location of the PDF report.

        Returns:
            The PDF text with digit groups joined and all whitespace runs
            collapsed to single spaces.
        """
        # NOTE(review): the temporary file is reopened by name while still
        # open, which the `tempfile` docs describe as platform-dependent
        # (works on Unix, not on Windows).
        with tempfile.NamedTemporaryFile() as tmp:
            download_file_from_url(url, tmp.name, verify=False)
            with open(tmp.name, "rb") as f:
                text = extract_text(f)
        # Order matters: first join digits separated by a single space (so a
        # number written as "1 234" becomes "1234"), and only then collapse
        # the remaining whitespace. The "doses" pattern relies on the single
        # spaces produced by the second step to tell the numbers apart.
        text = re.sub(r"(\d) (\d)", r"\1\2", text)
        text = re.sub(r"\s+", " ", text)
        return text

    def _parse_date(self, text: str) -> str:
        """Parse date from text.

        Args:
            text: Normalised PDF text.

        Returns:
            The first ``dd.mm.20yy`` date found, passed through `clean_date`.

        Raises:
            ValueError: If no date is found in ``text``.
        """
        match = re.search(self.regex["date"], text)
        if match is None:
            raise ValueError("Could not find the report date in the PDF text!")
        return clean_date(match.group(1), "%d.%m.%Y")

    def _parse_metrics(self, text: str) -> tuple:
        """Parse metrics from text.

        Args:
            text: Normalised PDF text.

        Returns:
            A 5-tuple ``(total_vaccinations, people_vaccinated,
            people_fully_vaccinated, total_boosters, doses_after_positive)``,
            each value passed through `clean_count`.

        Raises:
            ValueError: If the "doses" pattern does not match ``text``.
        """
        # Search once; the pattern has exactly five capture groups, in the
        # order documented above.
        match = re.search(self.regex["doses"], text)
        if match is None:
            raise ValueError("Could not find the vaccination metrics in the PDF text!")
        return tuple(clean_count(value) for value in match.groups())

    def enrich_vaccine(self, ds: pd.Series) -> pd.Series:
        """Enrich data with vaccine names."""
        return enrich_data(ds, "vaccine", "Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, Sputnik V")

    def enrich_location(self, ds: pd.Series) -> pd.Series:
        """Enrich data with location."""
        return enrich_data(ds, "location", self.location)

    def pipeline(self, df: pd.Series) -> pd.Series:
        """Pipeline for data.

        Applies `enrich_vaccine` and then `enrich_location`. The argument is
        the series produced by `read` (see `export`).
        """
        return df.pipe(self.enrich_vaccine).pipe(self.enrich_location)

    def export(self):
        """Export data to csv.

        Reads the latest record, runs it through `pipeline` and hands it to
        `export_datafile`.
        """
        data = self.read().pipe(self.pipeline)
        self.export_datafile(df=data, attach=True)
def main():
    """Instantiate the Azerbaijan scraper and run its export."""
    scraper = Azerbaijan()
    scraper.export()