import os
from typing import Optional

import pandas as pd

from cowidev import PATHS
from cowidev.grapher.files import Grapheriser, Exploriser
# Rows whose `num_sequences_total` is below this value are dropped by
# `filter_by_num_sequences`.
NUM_SEQUENCES_TOTAL_THRESHOLD = 100
# Path passed to the variants Grapheriser run in `run_grapheriser`.
FILE_GRAPHER = os.path.join(PATHS.INTERNAL_GRAPHER_DIR, "COVID-19 - Variants.csv")
# Path passed to the sequencing Grapheriser run in `run_grapheriser`.
FILE_SEQ_GRAPHER = os.path.join(PATHS.INTERNAL_GRAPHER_DIR, "COVID-19 - Sequencing.csv")
# Path passed to the Exploriser run in `run_explorerizer`.
FILE_EXPLORER = os.path.join(PATHS.DATA_INTERNAL_DIR, "megafile--variants.json")
def filter_by_num_sequences(df: pd.DataFrame, threshold: Optional[int] = None) -> pd.DataFrame:
    """Drop rows whose total number of sequences is below a threshold.

    A one-line summary of the discarded rows (count, share of all rows, number
    of distinct locations and the ten most affected locations) is printed.

    Args:
        df: Input data. Must contain the columns ``num_sequences_total`` and
            ``location``.
        threshold: Rows with ``num_sequences_total`` strictly below this value
            are dropped. When omitted, ``NUM_SEQUENCES_TOTAL_THRESHOLD`` is used.

    Returns:
        The rows of ``df`` that are not below the threshold.
    """
    if threshold is None:
        threshold = NUM_SEQUENCES_TOTAL_THRESHOLD
    msk = df["num_sequences_total"] < threshold
    num_skipped = msk.sum()
    # Info
    # Guard the ratio: an empty frame would otherwise compute 0/0 and report "nan%".
    _sk_perc_rows = round(100 * (num_skipped / len(df)), 2) if len(df) else 0.0
    skipped_locations = df.loc[msk, "location"]
    _sk_num_countries = skipped_locations.nunique()
    _sk_countries_top = skipped_locations.value_counts().head(10).to_dict()
    print(
        f"Skipping {num_skipped} datapoints ({_sk_perc_rows}%), affecting {_sk_num_countries} countries. Some are:"
        f" {_sk_countries_top}"
    )
    return df[~msk]
def variant_url_frienldy_name(df: pd.DataFrame) -> pd.DataFrame:
    """Rewrite the column labels of ``df`` into a URL-friendly form.

    Spaces and dots are turned into underscores and parentheses are removed.
    The labels are replaced on ``df`` itself, and that same object is returned.
    """
    # Applied in this order to every column label.
    substitutions = ((" ", "_"), (".", "_"), ("(", ""), (")", ""))

    renamed = []
    for label in df.columns:
        for old, new in substitutions:
            label = label.replace(old, new)
        renamed.append(label)

    df.columns = renamed
    return df
def run_grapheriser():
    """Run the two Grapheriser jobs: one for variants, one for sequencing.

    The variants job is run with ``PATHS.INTERNAL_OUTPUT_VARIANTS_FILE`` and
    ``FILE_GRAPHER``; the sequencing job with
    ``PATHS.INTERNAL_OUTPUT_VARIANTS_SEQ_FILE`` and ``FILE_SEQ_GRAPHER``.
    """
    # Variants: pivoted on "variant", with low-sequence rows filtered out first
    variants_job = Grapheriser(
        function_input=filter_by_num_sequences,
        pivot_column="variant",
        pivot_values=["num_sequences", "perc_sequences"],
        suffixes=["", "_percentage"],
        fillna_0=True,
    )
    variants_job.run(PATHS.INTERNAL_OUTPUT_VARIANTS_FILE, FILE_GRAPHER)

    # Sequencing: "variant_dominant" is excluded from the zero-filling
    sequencing_job = Grapheriser(
        columns_non_fillna_0=["variant_dominant"],
        fillna_0=True,
    )
    sequencing_job.run(PATHS.INTERNAL_OUTPUT_VARIANTS_SEQ_FILE, FILE_SEQ_GRAPHER)
def run_explorerizer():
    """Run the Exploriser job for variants.

    Rows are filtered with ``filter_by_num_sequences`` on the way in and column
    names are passed through ``variant_url_frienldy_name`` on the way out. The
    job is run with ``PATHS.INTERNAL_OUTPUT_VARIANTS_FILE`` and ``FILE_EXPLORER``.
    """
    explorer_job = Exploriser(
        function_input=filter_by_num_sequences,
        function_output=variant_url_frienldy_name,
        pivot_column="variant",
        pivot_values="perc_sequences",
    )
    explorer_job.run(PATHS.INTERNAL_OUTPUT_VARIANTS_FILE, FILE_EXPLORER)
def run_db_updater(input_path: str):
    """Placeholder for the database update step; always raises.

    Args:
        input_path: Currently unused.

    Raises:
        NotImplementedError: Always.
    """
    message = "Not yet implemented"
    raise NotImplementedError(message)