import re
import json
from bs4 import BeautifulSoup, element
import pandas as pd
from cowidev.utils import clean_count, clean_date_series, get_soup
from cowidev.testing.utils.base import CountryTestBase
[docs]class Andorra(CountryTestBase):
location = "Andorra"
units = "tests performed"
source_label = "Tauler COVID-19, Govern d'Andorra"
source_url_ref = "https://covid19.govern.ad"
regex = {
"script": r"'n_serologics': {",
"pcr": r"'n_pcr': { type: 'line', data: { labels:(.*?), datasets: .*? data: (.*?), fill:",
"tma": r"'n_tma': { type: 'line', data: { labels:(.*?), datasets: .*? data: (.*?), fill:",
}
[docs] def read(self) -> pd.DataFrame:
"""Reads data from the source page."""
soup = get_soup(self.source_url_ref)
data = self._parse_data(soup)
return data
[docs] def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
"""Gets data from the source page."""
# Get relevant element
elem = self._get_relevant_element(soup)
# Extract text from element
text = self._get_text_from_element(elem)
# Extract data from text
data = self._parse_metrics(text)
return data
[docs] def _get_relevant_element(self, soup: BeautifulSoup) -> element.Tag:
"""Gets the relevant element."""
elem = soup.find("script", text=re.compile(self.regex["script"]))
if not elem:
raise ValueError("No element found, please update the script")
return elem
[docs] def _get_text_from_element(self, elem: element.Tag) -> str:
"""Extracts text from the element."""
text = re.sub(r"\s+", " ", str(elem))
return text
[docs] def _parse_metrics(self, text: str) -> pd.DataFrame:
"""Get metrics from text."""
df_pcr = self._df_builder("pcr", text)
df_tma = self._df_builder("tma", text)
df = pd.merge(df_pcr, df_tma)
return df
[docs] def _df_builder(self, regex_key: str, text: str) -> pd.DataFrame:
"""Builds Dataframe"""
match = re.search(self.regex[regex_key], text)
if not match:
raise ValueError("No match found, please update the regex")
df = pd.DataFrame([json.loads(match.group(1)), json.loads(match.group(2))], index=["Date", f"{regex_key}"]).T
return df
[docs] def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
"""Pipes date column."""
return df.assign(Date=clean_date_series(df.Date, "%d/%m/%y"))
[docs] def pipe_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
"""Pipes metrics."""
return df.assign(
**{
"Cumulative total": df.pcr.apply(clean_count) + df.tma.apply(clean_count),
}
)
[docs] def pipe_correct_dp(self, df: pd.DataFrame):
"""Pipes the replacement data point."""
date = "2021-03-22"
correct_dp = 164665
df.loc[df.Date == date, "Cumulative total"] = correct_dp
return df
[docs] def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
"""Pipeline for data."""
return (
df.pipe(self.pipe_date)
.pipe(self.pipe_metrics)
.pipe(self.pipe_correct_dp)
.pipe(self.pipe_metadata)
.sort_values("Date")
)
[docs] def export(self):
"""Exports data to CSV."""
df = self.read().pipe(self.pipeline)
self.export_datafile(df)
[docs]def main():
Andorra().export()