Source code for cowidev.testing.utils.incremental

import os
import re
import numbers
import datetime

import pandas as pd
from cowidev import PATHS
from cowidev.utils.clean.numbers import metrics_to_num_int


UNITS_ACCEPTED = {"people tested", "samples tested", "tests performed", "units unclear", "tests performed (CDC)"}


[docs]def increment( sheet_name: str, country: str, units: str, date: str, source_url: str, source_label: str, notes=None, daily_change=None, count=None, ): output_path = os.path.join(PATHS.INTERNAL_OUTPUT_TEST_MAIN_DIR, f"{sheet_name}.csv") # Create new df df = pd.DataFrame( [ { "Country": country, "Units": units, "Date": date, "Source URL": source_url, "Source label": source_label, "Notes": notes, } ] ) if count is not None: df["Cumulative total"] = count if daily_change is not None: df["Daily change in cumulative total"] = daily_change # If file exists, merge if os.path.isfile(output_path): # Read current dataframe df_current = pd.read_csv(output_path) # Sanity checks _check_fields(df_current, country, source_url, source_label, units, date, count, daily_change) # Merge df_current = df_current[df_current.Date != date] df = pd.concat([df_current, df]) # Ensure Int64 type df = metrics_to_num_int(df, ["Cumulative total", "Daily change in cumulative total"]) df = df.sort_values("Date") if count is not None: df = df[~df["Cumulative total"].duplicated(keep="first") | (df["Cumulative total"].isnull())] # df = df.drop_duplicates(subset=["Cumulative total"], keep="first") # Export df.to_csv(output_path, index=False)
[docs]def _check_fields( df_current: str, location: str, source_url: str, source_label: str, units: str, date, cumulative_total: numbers.Number, daily_change: numbers.Number, ): # Check location, vaccine, source_url if not isinstance(location, str): type_wrong = type(location).__name__ raise TypeError(f"Check `location` type! Should be a str, found {type_wrong}. Value was {location}") if not isinstance(source_label, str): type_wrong = type(source_label).__name__ raise TypeError(f"Check `source_label` type! Should be a str, found {type_wrong}. Value was {source_label}") if not isinstance(source_url, str): type_wrong = type(source_url).__name__ raise TypeError(f"Check `source_url` type! Should be a str, found {type_wrong}. Value was {source_url}") if not isinstance(units, str): type_wrong = type(units).__name__ raise TypeError(f"Check `units` type! Should be a str, found {type_wrong}. Value was {units}") if units not in UNITS_ACCEPTED: raise ValueError(f"Value for `units` is not accepted ({units}). Should be one of {UNITS_ACCEPTED}") # Check metric daily_change if (cumulative_total is None) or (daily_change is not None): if not isinstance(daily_change, numbers.Number): type_wrong = type(location).__name__ raise TypeError( f"Check `daily_change` type! Should be numeric, found {type_wrong}. Value was {daily_change}" ) # Check metric cumulative_total if pd.isna(cumulative_total): if not isinstance(daily_change, numbers.Number): raise TypeError( f"Check `cumulative_total` type! It can't be NaN if no value for `daily_change` is provided." ) elif (daily_change is None) or (cumulative_total is not None): if not isinstance(cumulative_total, numbers.Number): type_wrong = type(location).__name__ raise TypeError( f"Check `cumulative_total` type! Should be numeric, found {type_wrong}. Value was {cumulative_total}" ) if df_current["Cumulative total"].max() > cumulative_total: raise ValueError(f"`cumulative_total` can't be lower than currently highers 'Cumulative total' value.") # Check date if not isinstance(date, str): type_wrong = type(date).__name__ raise TypeError(f"Check `date` type! Should be numeric, found {type_wrong}. Value was {date}") if not (re.match(r"\d{4}-\d{2}-\d{2}", date) and date <= str(datetime.date.today() + datetime.timedelta(days=1))): raise ValueError(f"Check `date`. It either does not match format YYYY-MM-DD or exceeds todays'date: {date}")