import json
import re
from datetime import datetime, timedelta

import pandas as pd
from cowidev.testing import CountryTestBase
from cowidev.utils.clean import clean_date, clean_count
from cowidev.utils.web import get_soup

class ElSalvador(CountryTestBase):
    """Scraper for El Salvador's daily COVID-19 testing figures.

    The data is read from an Infogram chart embedded in the reference page:
    the embed's ``data-id`` is looked up first, then the chart page is fetched
    and the JSON assigned to ``window.infographicData`` is parsed.

    ``pipe_rename_columns``, ``pipe_metadata`` and ``get_output_path`` are not
    defined here; they are presumably inherited from ``CountryTestBase``.
    """

    location: str = "El Salvador"
    units: str = "tests performed"
    source_label: str = "Government of El Salvador"
    # NOTE(review): both URLs are empty strings in this copy of the file, which
    # cannot work at runtime (``get_soup("")``). They look like they were
    # stripped; restore the real values from upstream before running.
    source_url_ref: str = ""
    source_url: str = ""
    regex: dict = {
        # Matches the stringified chart entity holding both series of interest.
        "title": r"\'PRUEBAS REALIZADAS\'\, \'CASOS POSITIVOS\'",
        # Captures the JSON object assigned to ``window.infographicData``.
        "element": r"window\.infographicData=({.*})",
    }
    rename_columns: dict = {
        "CASOS POSITIVOS": "positive",
        "PRUEBAS REALIZADAS": "Daily change in cumulative total",
    }

    def read(self) -> pd.DataFrame:
        """Read data from source.

        Returns:
            Raw chart data as returned by ``_load_data``.
        """
        data_id = self._get_data_id_from_source(self.source_url_ref)
        return self._load_data(data_id)

    def _get_data_id_from_source(self, source_url: str) -> str:
        """Get the Infogram data ID from the reference page.

        Args:
            source_url: URL of the page containing the ``infogram-embed`` element.

        Returns:
            Value of the embed element's ``data-id`` attribute.

        Raises:
            ValueError: If the page has no ``infogram-embed`` element.
        """
        soup = get_soup(source_url)
        embed = soup.find(class_="infogram-embed")
        if embed is None:
            # Fail with the same explicit message used elsewhere instead of an
            # opaque "'NoneType' object is not subscriptable".
            raise ValueError("Website Structure Changed, please update the script")
        return embed["data-id"]

    def _load_data(self, data_id: str) -> pd.DataFrame:
        """Load the chart data for ``data_id``.

        Args:
            data_id: Identifier appended to ``source_url`` to build the chart URL.

        Returns:
            One row per data row across all sheets of the matching chart; the
            first column of every sheet is named ``"Date"``. Empty if the chart
            has no sheets.

        Raises:
            ValueError: If the embedded JSON or the expected chart entity
                cannot be found.
        """
        url = f"{self.source_url}{data_id}"
        soup = get_soup(url)
        # NOTE(review): the ``re.search(self.regex[...], ...)`` calls and
        # ``match.group(1)`` were reconstructed from the surrounding code
        # (the source text was truncated here) - confirm against upstream.
        match = re.search(self.regex["element"], str(soup))
        if not match:
            raise ValueError("Website Structure Changed, please update the script")
        payload = json.loads(match.group(1))
        entities = payload["elements"]["content"]["content"]["entities"]

        # Pick the first entity whose stringified values mention both series.
        title_pattern = self.regex["title"]
        chart = next(
            (entity for entity in entities.values() if re.search(title_pattern, str(entity.values()))),
            None,
        )
        if chart is None:
            raise ValueError("Website Structure Changed, please update the script")

        # Each sheet is a list of rows whose first row is the header; the
        # header's first cell is replaced by "Date".
        frames = []
        for sheet in chart["props"]["chartData"]["data"]:
            header, *rows = sheet
            frames.append(pd.DataFrame(rows, columns=["Date", *header[1:]]))
        if not frames:
            return pd.DataFrame()
        # A single concat replaces the per-sheet ``DataFrame.append`` calls
        # (``append`` was removed in pandas 2.0).
        return pd.concat(frames, ignore_index=True)

    def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean date.

        Only the last row's label is parsed (with a year appended, using the
        Spanish ``%d-%b-%Y`` format); every earlier row is assigned the
        preceding consecutive calendar day, so rows are assumed to be daily
        and gap-free.

        Args:
            df: Frame with a ``Date`` column; modified in place.

        Returns:
            The same frame with ``Date`` replaced by date strings.
        """
        # NOTE(review): ``datetime.now().year`` was reconstructed (the source
        # text was truncated here) - confirm against upstream.
        last_label = df["Date"].iloc[-1] + "-" + str(datetime.now().year)
        last_date = clean_date(last_label, "%d-%b-%Y", lang="es", as_datetime=True)
        first_date = last_date - timedelta(len(df.index) - 1)
        df["Date"] = pd.Series(pd.date_range(first_date, last_date).astype(str))
        return df

    def pipe_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean numeric columns by applying ``clean_count`` to each value."""
        for column in ("positive", "Daily change in cumulative total"):
            df[column] = df[column].apply(clean_count)
        return df

    def pipe_pr(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate Positive Rate.

        The rate is the 7-row rolling sum of ``positive`` divided by the 7-row
        rolling sum of ``Daily change in cumulative total``, rounded to three
        decimals. Rows without a full window are filled with 0.
        """
        positives = df["positive"].rolling(7).sum()
        tests = df["Daily change in cumulative total"].rolling(7).sum()
        df["Positive rate"] = positives.div(tests).round(3).fillna(0)
        return df

    def pipe_merge(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepend previously exported rows that predate the new data.

        Rows of the existing output CSV dated before the earliest date in
        ``df`` are kept; the result is sorted by ``Date``.
        """
        df_current = pd.read_csv(self.get_output_path())
        df_current = df_current[df_current.Date < df.Date.min()]
        return pd.concat([df_current, df]).sort_values("Date")

    def pipe_positive(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` unchanged (pass-through pipeline step)."""
        return df

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pipeline for data processing."""
        return (
            df.pipe(self.pipe_rename_columns)
            .pipe(self.pipe_date)
            .pipe(self.pipe_metadata)
            .pipe(self.pipe_merge)
            .pipe(self.pipe_positive)
            .pipe(self.pipe_numeric)
            .pipe(self.pipe_pr)
        )

    def export(self):
        """Export data to csv at ``self.get_output_path()``."""
        # NOTE(review): ``self.read().pipe(self.pipeline)`` was reconstructed
        # (the source text was truncated here) - confirm against upstream.
        df = self.read().pipe(self.pipeline)
        # NOTE(review): the original carried a commented-out call to
        # ``self.export_datafile(df, float_format="%.5f")``; the CSV is written
        # directly instead.
        df.to_csv(self.get_output_path(), index=False)
def main():
    """Run the El Salvador scraper and export its data."""
    ElSalvador().export()