import re
import json
import pandas as pd
from cowidev.testing import CountryTestBase
from cowidev.utils.web import get_soup
from cowidev.utils.clean import clean_date_series, clean_count
class Iceland(CountryTestBase):
    """Collect Iceland's testing data from the Infogram charts embedded on covid.is.

    The flow is: find the Infogram ``data-id`` on ``source_url_ref``, download
    the Infogram page at ``source_url + data-id``, extract the
    ``window.infographicData`` JSON blob, turn the two relevant charts (tests
    and positives) into one DataFrame, and run the cleaning pipeline.
    """

    location: str = "Iceland"
    units: str = "tests performed"
    source_label: str = "Government of Iceland"
    source_url_ref: str = "https://www.covid.is/data"
    # Base URL of the Infogram embed; the data id is appended to it.
    source_url: str = "https://e.infogram.com/"
    regex: dict = {
        # Title fragments used to locate the test chart and the positive-case
        # chart among the Infogram entities.
        "title_test": r"Fjöldi sýna eftir dögum",
        "title_positive": r"Fjöldi smita innanlands",
        # Captures the JSON object assigned to `window.infographicData`.
        "element": r"window\.infographicData=({.*})",
    }
    # Source column headers -> short names. `t*` columns are test counts and
    # `p*` columns are positive counts (see `pipe_row_sum`). Several source
    # headers map to the same short name. `t4` is renamed but is not part of
    # the sum computed in `pipe_row_sum`.
    # NOTE(review): presumably consumed by the inherited `pipe_rename_columns`
    # -- confirm against `CountryTestBase`.
    rename_columns: dict = {
        "Symptomatic tests": "t1",
        # NOTE(review): misspelled key kept as-is; presumably it matches a
        # header that appears in the source data -- confirm before removing.
        "Sympotmatic tests": "t1",
        "PCR domestic tests": "t1",
        "Domestic tests (PCR or antigen)": "t1",
        "Antigen domestic tests": "t2",
        "Quarantine- and random tests": "t2",
        "deCODE Genetics screening": "t3",
        "Border tests 1 and 2": "t4",
        "Border tests": "t4",
        "Domestic infections (PCR or antigen test)": "p1",
        "Domestic infections": "p1",
        "Symptomatic screening": "p1",
        "Domestic infections PCR": "p2",
        "Quarantine- and random screening": "p2",
        "Screening by deCODE Genetics": "p3",
    }

    def read(self) -> pd.DataFrame:
        """Read data from source.

        Returns:
            The merged tests/positives DataFrame built by `_build_df`.
        """
        data_id = self._get_data_id_from_source(self.source_url_ref)
        data = self._load_data(data_id)
        return self._build_df(data)

    def _get_data_id_from_source(self, source_url: str) -> str:
        """Get the Infogram data id from the page at `source_url`.

        Args:
            source_url: URL of the page that embeds the Infogram chart.

        Returns:
            Value of the ``data-id`` attribute of the first element with class
            ``infogram-embed``.

        Raises:
            ValueError: If no such element is found on the page.
        """
        soup = get_soup(source_url)
        embed = soup.find(class_="infogram-embed")
        if embed is None:
            # Fail with an explicit message instead of a TypeError on `None[...]`.
            raise ValueError("Website Structure Changed, please update the script")
        return embed["data-id"]

    def _load_data(self, data_id) -> dict:
        """Load the Infogram JSON payload for `data_id`.

        Args:
            data_id: Identifier appended to `source_url` to build the request URL.

        Returns:
            The decoded JSON object captured by ``regex["element"]``.

        Raises:
            ValueError: If the pattern is not found in the downloaded page.
        """
        url = f"{self.source_url}{data_id}"
        soup = get_soup(url)
        match = re.search(self.regex["element"], str(soup))
        if not match:
            raise ValueError("Website Structure Changed, please update the script")
        return json.loads(match.group(1))

    def _find_chart(self, entities: dict, title_regex: str) -> dict:
        """Return the first entity whose string representation matches `title_regex`.

        Raises:
            ValueError: If no entity matches.
        """
        chart = next((v for v in entities.values() if re.search(title_regex, str(v))), None)
        if chart is None:
            raise ValueError("Website Structure Changed, please update the script")
        return chart

    @staticmethod
    def _chart_to_df(chart: dict) -> pd.DataFrame:
        """Concatenate every sheet of an Infogram chart into a single DataFrame.

        Each sheet is a list of rows whose first row holds the column headers.
        """
        frames = []
        for sheet in chart["props"]["chartData"]["data"]:
            if not sheet:
                # No header row to build columns from; nothing to add.
                continue
            # First row is the header: use it for the columns, then drop it.
            frame = pd.DataFrame(sheet, columns=sheet[0]).drop(0)
            # Discard rows whose first cell is an empty string.
            frame = frame[frame.iloc[:, 0] != ""]
            # Missing header cells become "" so the column labels are all strings.
            frame.columns = frame.columns.fillna("")
            frames.append(frame)
        return pd.concat(frames, ignore_index=True)

    def _build_df(self, data: dict) -> pd.DataFrame:
        """Create the merged tests/positives DataFrame from the raw payload.

        Args:
            data: Decoded Infogram JSON, as returned by `_load_data`.

        Returns:
            `pd.merge` of the tests frame and the positives frame (joined on
            the columns they have in common).

        Raises:
            ValueError: If either chart cannot be located by its title.
        """
        entities = data["elements"]["content"]["content"]["entities"]
        tests = self._chart_to_df(self._find_chart(entities, self.regex["title_test"]))
        pos = self._chart_to_df(self._find_chart(entities, self.regex["title_positive"]))
        return pd.merge(tests, pos)

    def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
        """Parse the first column as dates (``%d.%m.%y``) into `Date` and sort by it."""
        return df.assign(Date=clean_date_series(df.iloc[:, 0], "%d.%m.%y")).sort_values("Date")

    def pipe_row_sum(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add per-row totals of tests and positives.

        ``Daily change in cumulative total`` is the row-wise sum of ``t1``,
        ``t2`` and ``t3``; ``positive`` is the row-wise sum of ``p1``, ``p2``
        and ``p3``. Every cell goes through `clean_count` first. Rows with a
        repeated `Date` are dropped, keeping the first occurrence.

        The input frame is not modified; a new frame is returned.
        """
        tests_total = df[["t1", "t2", "t3"]].applymap(clean_count).sum(axis=1)
        positive_total = df[["p1", "p2", "p3"]].applymap(clean_count).sum(axis=1)
        return df.assign(
            **{
                "Daily change in cumulative total": tests_total,
                "positive": positive_total,
            }
        ).drop_duplicates(subset="Date")

    def pipe_pr(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate the positive rate over a rolling 7-row window.

        The rate is the rolling sum of ``positive`` divided by the rolling sum
        of ``Daily change in cumulative total``, with missing values replaced
        by 0 and the result rounded to 3 decimals.

        The input frame is not modified; a new frame is returned.
        """
        positive_sum = df["positive"].rolling(7).sum()
        tests_sum = df["Daily change in cumulative total"].rolling(7).sum()
        rate = positive_sum.div(tests_sum).fillna(0).round(3)
        return df.assign(**{"Positive rate": rate})

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pipeline for data processing.

        Order matters: columns are renamed before the sums that rely on the
        short names, and dates are parsed and sorted before the rolling
        positive rate is computed.
        """
        return (
            df.pipe(self.pipe_rename_columns)
            .pipe(self.pipe_date)
            .pipe(self.pipe_row_sum)
            .pipe(self.pipe_pr)
            .pipe(self.pipe_metadata)
        )

    def export(self):
        """Read, process and export the data via `export_datafile`."""
        df = self.read().pipe(self.pipeline)
        self.export_datafile(df, float_format="%.5f")
def main():
    """Instantiate `Iceland` and run its export."""
    Iceland().export()