# Source code for cowidev.testing.batch.kazakhstan

"""Constructs daily time series of COVID-19 testing data for Kazakhstan.

Dashboard: https://hls.kz/

Notes:

* This module requires ChromeDriver to be installed
    (https://chromedriver.chromium.org/downloads) and in your executable
    $PATH.
"""

import time
import json
import datetime
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options

from cowidev.testing import CountryTestBase


# Constant metadata written into the exported table by `Kazakhstan.export`
# (columns "Country", "Units" and "Source label"; SOURCE_URL is the fallback
# for rows whose "Source URL" is null).
# TESTING_TYPE is not referenced by any code visible in this module.
COUNTRY = "Kazakhstan"
UNITS = "tests performed"
TESTING_TYPE = "unclear"
SOURCE_LABEL = "Kazakhstan National Center for Public Health"
SOURCE_URL = "https://hls.kz/"

# Name of the metric column that `get_data` keeps and `export` writes out.
SERIES_TYPE = "Cumulative total"  # one of: {'Cumulative total', 'Daily change in cumulative total'}
# Dashboard page that `get_data` opens in headless Chrome.
URL = "https://qap.datanomix.pro/single/?appid=9ad0ce19-79e5-4c5a-9fc0-83f858de0153&sheet=dfd9c555-2dfa-4ce9-8231-5d588004e1ef"
# `qInfo.qId` value a websocket response must carry to be treated as the
# testing time series.
COMPONENT_ID = "16a9b11c-56fe-4b07-8598-bcb20677924d"
# Value passed to `driver.implicitly_wait` in `get_data`.
IMPLICIT_WAIT = 30
# Maximum number of `get_data` attempts made by `Kazakhstan.export`.
MAX_TRIES = 1
# Upper bound for the elapsed-time counter of the log-polling loop in
# `get_data` (the counter advances in steps of the polling interval).
TIMEOUT = 30


# Hardcoded values. When this list is non-empty, `get_data` drops scraped
# rows that share a date with an entry here and appends these rows instead.
hardcoded_data = [
    # {'Date': "", SERIES_TYPE: , "Source URL": ""},
]


# Sample of official values for cross-checking against the API data.
# Each entry is (date as "YYYY-MM-DD", {SERIES_TYPE: expected value, "source": url}).
# `sanity_checks` asserts that the scraped SERIES_TYPE value on each date
# equals the expected value; the "source" key is not read by the code
# visible in this module.
sample_official_data = [
    ("2020-09-04", {SERIES_TYPE: 2571562, "source": "https://hls.kz/"}),
    ("2020-08-12", {SERIES_TYPE: 2252153, "source": "https://hls.kz/"}),
    ("2020-07-01", {SERIES_TYPE: 1536607, "source": "https://hls.kz/"}),
    ("2020-06-20", {SERIES_TYPE: 1302094, "source": "https://hls.kz/"}),
    ("2020-04-30", {SERIES_TYPE: 249527, "source": "https://hls.kz/"}),
    ("2020-04-04", {SERIES_TYPE: 45552, "source": "https://hls.kz/"}),
    ("2020-03-14", {SERIES_TYPE: 445, "source": "https://hls.kz/"}),
    ("2020-03-13", {SERIES_TYPE: 126, "source": "https://hls.kz/"}),
]


class Kazakhstan(CountryTestBase):
    """Exporter of the Kazakhstan COVID-19 testing time series."""

    location = "Kazakhstan"

    def export(self) -> None:
        """Scrape, annotate, validate and export the testing time series.

        Calls ``get_data`` up to ``MAX_TRIES`` times until it returns a
        DataFrame, fills in the constant metadata columns, runs
        ``sanity_checks`` and passes the result to ``self.export_datafile``
        (not defined in this module; presumably inherited from
        ``CountryTestBase``).

        Raises:
            AssertionError: if no attempt produced data, or if one of the
                sanity checks fails.
        """
        attempts = 0
        frame = None
        for attempts in range(1, MAX_TRIES + 1):
            frame = get_data()
            if frame is not None:
                break
        assert frame is not None, f"Failed to retrieve testing data after {attempts} tries."

        def _with_default_source(url):
            # Rows without their own source fall back to the dashboard URL.
            return SOURCE_URL if pd.isnull(url) else url

        frame["Source URL"] = frame["Source URL"].apply(_with_default_source)

        constant_columns = {
            "Country": COUNTRY,
            "Units": UNITS,
            "Source label": SOURCE_LABEL,
            "Notes": "",
        }
        for column, value in constant_columns.items():
            frame[column] = value

        sanity_checks(frame)

        output_columns = [
            "Country",
            "Units",
            "Date",
            SERIES_TYPE,
            "Source URL",
            "Source label",
            "Notes",
        ]
        self.export_datafile(frame[output_columns])
def _collect_performance_log(driver, wait: int = 5, min_events: int = 1000) -> list:
    """Poll the Chrome "performance" log until enough events were captured.

    Polling continues while no more than ``min_events`` entries have been
    collected or the previous poll still returned new entries, and stops once
    the elapsed-time counter reaches ``TIMEOUT``.

    Raises:
        RuntimeError: if no more than ``min_events`` entries were collected.
    """
    time.sleep(wait * 5)  # the dashboard tends to be slow to load.
    elapsed = wait * 3
    browser_log = []
    n_new_logs = 0
    # `<=` keeps the loop consistent with the "> min_events" requirement below.
    while (len(browser_log) <= min_events or n_new_logs > 0) and elapsed < TIMEOUT:
        new_logs = driver.get_log("performance")
        n_new_logs = len(new_logs)
        if n_new_logs > 0:
            browser_log += new_logs
        time.sleep(wait)
        elapsed += wait
    if len(browser_log) <= min_events:
        raise RuntimeError(
            f"Found only {len(browser_log)} browser "
            "log events, but expected > 1000 events. "
            "If this problem persists, check that "
            f"the dashboard is functional ({URL}) "
            "and try increasing "
            "time.sleep()."
        )
    return browser_log


def _find_data_matrix(browser_log: list) -> list:
    """Return the ``qMatrix`` of the websocket response for ``COMPONENT_ID``.

    Only ``Network.webSocketFrameReceived`` events are inspected. Frames whose
    payload is not JSON or does not have the expected layout are skipped.

    Raises:
        RuntimeError: if no frame carries the wanted component.
    """
    events = (json.loads(entry["message"])["message"] for entry in browser_log)
    for event in events:
        if event["method"] != "Network.webSocketFrameReceived":
            continue
        try:
            resp_data = json.loads(event["params"]["response"]["payloadData"])
            layout = resp_data["result"]["qLayout"][0]["value"]
            data_matrix = layout["qHyperCube"]["qDataPages"][0]["qMatrix"]
            component_id = layout["qInfo"]["qId"]
        except (KeyError, IndexError, TypeError, ValueError):
            # Unrelated frame (json.JSONDecodeError is a ValueError).
            continue
        if component_id == COMPONENT_ID:
            return data_matrix
    raise RuntimeError(
        "Failed to find testing data in websocket "
        "responses. If this problem persists, check "
        f"that the dashboard is functional ({URL}) "
        "and try increasing time.sleep()."
    )


def _matrix_to_frame(data_matrix: list) -> pd.DataFrame:
    """Convert the dashboard matrix into a ``Date``/``SERIES_TYPE``/``Source URL`` frame."""
    rows = [
        {
            "Date": cells[0]["qText"],
            "Cumulative total": cells[1]["qNum"],
            "Daily change in cumulative total": cells[2]["qNum"],
        }
        for cells in data_matrix
    ]
    df = pd.DataFrame(rows)
    df["Date"] = pd.to_datetime(df["Date"], format="%d.%m.%Y").dt.strftime("%Y-%m-%d")
    df["Cumulative total"] = df["Cumulative total"].astype(float)
    df["Daily change in cumulative total"] = df["Daily change in cumulative total"].astype(float)

    # Keep rows with a positive or missing daily change. `.copy()` makes the
    # column assignment below operate on an independent frame, not a view.
    daily_change = df["Daily change in cumulative total"]
    df = df[(daily_change > 0) | daily_change.isnull()].copy()
    df["Source URL"] = None
    df = df[["Date", SERIES_TYPE, "Source URL"]]

    if hardcoded_data:
        # Hardcoded rows take precedence over scraped rows for the same date.
        hardcoded_dates = [d["Date"] for d in hardcoded_data]
        df = df[~df["Date"].isin(hardcoded_dates)]
        df_hardcoded = pd.DataFrame(hardcoded_data)
        df = pd.concat([df, df_hardcoded], axis=0, sort=False).reset_index(drop=True)

    df = df.sort_values("Date")
    df = df.dropna(subset=["Date", SERIES_TYPE], how="any")
    df[SERIES_TYPE] = df[SERIES_TYPE].astype(int)
    return df


def get_data() -> pd.DataFrame:
    """Scrape the testing time series from the dashboard at ``URL``.

    Opens the dashboard in headless Chrome with performance logging enabled,
    collects the browser log, locates the websocket response whose component
    id equals ``COMPONENT_ID`` and converts its data matrix to a DataFrame
    with the columns ``Date``, ``SERIES_TYPE`` and ``Source URL``.

    Returns:
        The DataFrame sorted by date, or ``None`` if anything went wrong
        after the browser options were built (the error is printed).
    """
    options = Options()
    options.add_argument("--headless")
    # Copy so the shared class-level capabilities dict is not mutated.
    caps = DesiredCapabilities.CHROME.copy()
    caps["goog:loggingPrefs"] = {"performance": "ALL"}

    driver = None  # stays None if Chrome fails to start, see `finally`.
    df = None
    try:
        # NOTE(review): the `desired_capabilities` keyword is deprecated in
        # newer Selenium 4 releases - confirm the pinned version accepts it.
        driver = webdriver.Chrome(desired_capabilities=caps, options=options)
        driver.implicitly_wait(IMPLICIT_WAIT)
        driver.get(URL)
        browser_log = _collect_performance_log(driver)
        data_matrix = _find_data_matrix(browser_log)
        df = _matrix_to_frame(data_matrix)
    except Exception as e:
        df = None
        print(f"Error in retrieving testing data: {e}")
    finally:
        if driver is not None:
            driver.quit()
    return df
def sanity_checks(df: pd.DataFrame) -> None:
    """Check that there are no obvious errors in the scraped data.

    The following checks run on a copy of ``df`` (the input is not modified):

    * the frame is not empty;
    * the latest ``Date`` is earlier than tomorrow (UTC);
    * no date occurs more than once;
    * ``Cumulative total`` never decreases from one row to the next (it is
      derived from ``Daily change in cumulative total`` when missing);
    * the ``SERIES_TYPE`` value on every date in ``sample_official_data``
      equals the official figure.

    Args:
        df: scraped data with a ``Date`` column of ``YYYY-MM-DD`` strings and
            the metric column(s).

    Raises:
        AssertionError: if any check fails.
    """
    df_temp = df.copy()

    # An empty frame would otherwise fail inside strptime with a TypeError.
    assert len(df_temp) > 0, "No rows to check: the scraped data is empty."

    # checks that the max date is less than tomorrow's date.
    # (naive UTC timestamp; `datetime.utcnow()` is deprecated since Python 3.12)
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    max_date = datetime.datetime.strptime(df_temp["Date"].max(), "%Y-%m-%d")
    assert max_date < now_utc + datetime.timedelta(days=1), (
        f"The latest date ({max_date:%Y-%m-%d}) is in the future."
    )

    # checks that there are no duplicate dates
    assert df_temp["Date"].duplicated().sum() == 0, "One or more rows share the same date."

    if "Cumulative total" not in df_temp.columns:
        df_temp["Cumulative total"] = df_temp["Daily change in cumulative total"].cumsum()

    # checks that the cumulative number of tests on date t is never smaller
    # than the figure for t-1 (rows are compared in their current order).
    cumulative = df_temp["Cumulative total"]
    assert (
        cumulative.iloc[1:] >= cumulative.shift(1).iloc[1:]
    ).all(), "On one or more dates, `Cumulative total` is greater on date t-1."

    # cross-checks a sample of scraped figures against the expected result.
    assert len(sample_official_data) > 0
    for dt, d in sample_official_data:
        # A missing date sums to 0 and therefore fails the comparison.
        val = df_temp.loc[df_temp["Date"] == dt, SERIES_TYPE].sum()
        # `:,` (not `:,d`) so a float-typed series cannot turn the failure
        # message into a ValueError.
        assert val == d[SERIES_TYPE], f"scraped value ({val:,}) != official value ({d[SERIES_TYPE]:,}) on {dt}"
    return None
def main():
    """Run the Kazakhstan testing-data export."""
    exporter = Kazakhstan()
    exporter.export()