import json
import re
from datetime import datetime, timedelta
import pandas as pd
from cowidev.testing import CountryTestBase
from cowidev.utils.clean import clean_date, clean_count
from cowidev.utils.web import get_soup
[docs]class ElSalvador(CountryTestBase):
location: str = "El Salvador"
units: str = "tests performed"
source_label: str = "Government of El Salvador"
source_url_ref: str = "https://covid19.gob.sv/"
source_url: str = "https://e.infogram.com/"
regex: dict = {
"title": r"\'PRUEBAS REALIZADAS\'\, \'CASOS POSITIVOS\'",
"element": r"window\.infographicData=({.*})",
}
rename_columns: dict = {
"CASOS POSITIVOS": "positive",
"PRUEBAS REALIZADAS": "Daily change in cumulative total",
}
[docs] def read(self) -> pd.DataFrame:
"""Read data from source"""
data_id = self._get_data_id_from_source(self.source_url_ref)
df = self._load_data(data_id)
return df
[docs] def _get_data_id_from_source(self, source_url: str) -> str:
"""Get Data ID from source"""
soup = get_soup(source_url)
data_id = soup.find(class_="infogram-embed")["data-id"]
return data_id
[docs] def _load_data(self, data_id: str) -> pd.DataFrame:
"""Load data from source"""
url = f"{self.source_url}{data_id}"
soup = get_soup(url)
match = re.search(self.regex["element"], str(soup))
if not match:
raise ValueError("Website Structure Changed, please update the script")
data = json.loads(match.group(1))
data = data["elements"]["content"]["content"]["entities"]
data = [data[idx] for idx in data if re.search(self.regex["title"], str(data[idx].values()))][0]
data_list = data["props"]["chartData"]["data"]
df = pd.DataFrame()
for frame in data_list:
col = frame.pop(0)
col[0] = "Date"
df = df.append(pd.DataFrame(frame, columns=col), ignore_index=True)
return df
[docs] def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean date"""
last_date = clean_date(
df[-1:]["Date"].item() + "-" + str(datetime.today().year), "%d-%b-%Y", lang="es", as_datetime=True
)
first_date = last_date - timedelta(len(df.index) - 1)
df["Date"] = pd.Series(pd.date_range(first_date, last_date).astype(str))
return df
[docs] def pipe_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean numeric columns"""
df["positive"] = df["positive"].apply(clean_count)
df["Daily change in cumulative total"] = df["Daily change in cumulative total"].apply(clean_count)
return df
[docs] def pipe_pr(self, df: pd.DataFrame) -> pd.DataFrame:
"""Calculate Positive Rate"""
df["Positive rate"] = (
df["positive"].rolling(7).sum().div(df["Daily change in cumulative total"].rolling(7).sum()).round(3)
).fillna(0)
return df
[docs] def pipe_merge(self, df: pd.DataFrame) -> pd.DataFrame:
df_current = pd.read_csv(self.get_output_path())
df_current = df_current[df_current.Date < df.Date.min()]
df = pd.concat([df_current, df]).sort_values("Date")
return df
[docs] def pipe_positive(self, df: pd.DataFrame) -> pd.DataFrame:
return df
[docs] def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
"""Pipeline for data processing"""
return (
df.pipe(self.pipe_rename_columns)
.pipe(self.pipe_date)
.pipe(self.pipe_metadata)
.pipe(self.pipe_merge)
.pipe(self.pipe_positive)
.pipe(self.pipe_numeric)
.pipe(self.pipe_pr)
)
[docs] def export(self):
"""Export data to csv"""
df = self.read().pipe(self.pipeline)
# self.export_datafile(df, float_format="%.5f")
df.to_csv(self.get_output_path(), index=False)
[docs]def main():
ElSalvador().export()