# Source code for cowidev.megafile.steps.test

import os
from datetime import date

import pandas as pd
from cowidev import PATHS


# Locations taken from cowidev's PATHS object.
# NOTE(review): INPUT_DIR and DATA_DIR are not referenced by the code visible in this
# section — confirm whether other modules import them before removing.
INPUT_DIR = PATHS.INTERNAL_INPUT_DIR
DATA_DIR = PATHS.DATA_DIR
# CSV file that get_testing() loads with pd.read_csv.
data_file = PATHS.DATA_TEST_MAIN_FILE


def get_testing() -> pd.DataFrame:
    """Load the main COVID-19 testing dataset and reshape it.

    Reads the CSV at the module-level ``data_file`` path, keeps only the
    testing columns, renames them to snake_case names, rounds the
    per-thousand and tests-per-case metrics to 3 decimals, and splits the
    original ``Entity`` value on ``" - "`` into ``location`` and
    ``tests_units``. Because the source can hold more than one time series
    per country, the result is checked for duplicated location/date pairs.

    Returns:
        pd.DataFrame: One row per location/date with the columns
        ``location``, ``date``, ``total_tests``, ``new_tests``,
        ``new_tests_smoothed``, ``total_tests_per_thousand``,
        ``new_tests_per_thousand``, ``new_tests_smoothed_per_thousand``,
        ``positive_rate``, ``tests_per_case`` and ``tests_units``. Rows
        dated today or later are excluded.

    Raises:
        ValueError: If more than one row exists for the same location and
            date after the entity has been split.
    """
    # Source column -> output column. The keys double as the `usecols` list so
    # the set of columns read and the set renamed cannot drift apart.
    column_names = {
        "Entity": "location",
        "Date": "date",
        "Cumulative total": "total_tests",
        "Daily change in cumulative total": "new_tests",
        "7-day smoothed daily change": "new_tests_smoothed",
        "Cumulative total per thousand": "total_tests_per_thousand",
        "Daily change in cumulative total per thousand": "new_tests_per_thousand",
        "7-day smoothed daily change per thousand": "new_tests_smoothed_per_thousand",
        "Short-term positive rate": "positive_rate",
        "Short-term tests per case": "tests_per_case",
    }
    testing = pd.read_csv(data_file, usecols=list(column_names))
    testing = testing.rename(columns=column_names)

    rounded_columns = [
        "total_tests_per_thousand",
        "new_tests_per_thousand",
        "new_tests_smoothed_per_thousand",
        "tests_per_case",
    ]
    testing[rounded_columns] = testing[rounded_columns].round(3)

    # Split the original entity into location and testing units.
    # NOTE(review): the two-column assignment only succeeds when the split
    # yields exactly two columns, i.e. entities look like
    # "<location> - <units>" — confirm against the dataset.
    testing[["location", "tests_units"]] = testing["location"].str.split(" - ", expand=True)

    # Check for remaining duplicates of location/date
    duplicates = testing.groupby(["location", "date"]).size().to_frame("n")
    duplicates = duplicates[duplicates["n"] > 1]
    if not duplicates.empty:
        print(duplicates)
        raise ValueError("Multiple rows for the same location and date")

    # Remove observations for current day to avoid rows with testing data but
    # no case/deaths.
    # NOTE(review): this is a string comparison; it presumably relies on `date`
    # holding ISO-formatted (YYYY-MM-DD) text — confirm.
    testing = testing[testing["date"] < str(date.today())]
    return testing