import tempfile
import re
import pandas as pd
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text
from cowidev.utils import clean_date, clean_count, get_soup
from cowidev.utils.web.download import download_file_from_url
from cowidev.vax.utils.base import CountryVaxBase
from cowidev.vax.utils.incremental import enrich_data
[docs]class Azerbaijan(CountryVaxBase):
location = "Azerbaijan"
source_url = "https://koronavirusinfo.az"
regex = {
"title": r"Vaksinasiya",
"date": r"(\d{2}\.\d{2}\.20\d{2})",
"doses": r"\"Buster\" doza vaksinlərin sayı (\d+) (\d+) (\d+) (\d+) (\d+) Gün",
}
[docs] def read(self) -> pd.Series:
"""Read data from source."""
soup = get_soup(self.source_url, verify=False)
data = self._parse_data(soup)
return pd.Series(data)
[docs] def _parse_data(self, soup: BeautifulSoup) -> dict:
"""get data from the source page."""
# Get pdf url
url = self._parse_pdf_link(soup)
if not url.endswith(".pdf"):
raise ValueError(f"File reporting metrics is not a PDF: {url}!")
# Extract pdf text
text = self._parse_pdf_text(url)
# Extract date from text
date = self._parse_date(text)
# Extract metrics from text
(
total_vaccinations,
people_vaccinated,
people_fully_vaccinated,
total_boosters,
doses_after_positive,
) = self._parse_metrics(text)
record = {
"total_vaccinations": total_vaccinations,
"people_vaccinated": people_vaccinated,
"people_fully_vaccinated": people_fully_vaccinated,
"total_boosters": total_boosters,
"doses_after_positive": doses_after_positive,
"source_url": self.source_url,
"date": date,
}
return record
[docs] def _parse_pdf_link(self, soup: BeautifulSoup) -> str:
"""Parse pdf link from source page."""
href = soup.find("a", string=self.regex["title"]).get("href")
return f"{self.source_url}{href}"
[docs] def _parse_pdf_text(self, url: str) -> str:
"""Parse pdf text from url."""
with tempfile.NamedTemporaryFile() as tmp:
download_file_from_url(url, tmp.name, verify=False)
with open(tmp.name, "rb") as f:
text = extract_text(f)
text = re.sub(r"(\d) (\d)", r"\1\2", text)
text = re.sub(r"\s+", " ", text)
return text
[docs] def _parse_date(self, text: str) -> str:
"""Parse date from text."""
date_str = re.search(self.regex["date"], text).group(1)
return clean_date(date_str, "%d.%m.%Y")
[docs] def _parse_metrics(self, text: str) -> tuple:
"""Parse metrics from text."""
total_vaccinations = re.search(self.regex["doses"], text).group(1)
people_vaccinated = re.search(self.regex["doses"], text).group(2)
people_fully_vaccinated = re.search(self.regex["doses"], text).group(3)
total_boosters = re.search(self.regex["doses"], text).group(4)
dose_after_positive = re.search(self.regex["doses"], text).group(5)
return (
clean_count(total_vaccinations),
clean_count(people_vaccinated),
clean_count(people_fully_vaccinated),
clean_count(total_boosters),
clean_count(dose_after_positive),
)
[docs] def enrich_vaccine(self, ds: pd.Series) -> pd.Series:
"""Enrich data with vaccine names."""
return enrich_data(ds, "vaccine", "Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, Sputnik V")
[docs] def enrich_location(self, ds: pd.Series) -> pd.Series:
"""Enrich data with locationß."""
return enrich_data(ds, "location", self.location)
[docs] def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
"""Pipeline for data."""
return df.pipe(self.enrich_vaccine).pipe(self.enrich_location)
[docs] def export(self):
"""Export data to csv."""
data = self.read().pipe(self.pipeline)
self.export_datafile(df=data, attach=True)
[docs]def main():
Azerbaijan().export()