import os
import pandas as pd
from cowidev import PATHS
# Input locations used by this module; most are resolved through cowidev's PATHS object.
# Directory that holds the macro-variable CSVs loaded by load_macro_df().
INPUT_DIR = PATHS.INTERNAL_INPUT_DIR
# Vaccination data; also used to derive the excluded locations / ISO codes.
VACCINATIONS_CSV = PATHS.DATA_VAX_MAIN_FILE
# Testing data (counted through its "ISO code" column in get_placeholder()).
TESTING_CSV = PATHS.DATA_TEST_MAIN_FILE
# JHU cases and deaths files (counted by column in get_num_countries_jhu()).
CASES_CSV = PATHS.DATA_JHU_CASES_FILE
DEATHS_CSV = PATHS.DATA_JHU_DEATHS_FILE
# Hospital & ICU file located inside the internal grapher directory.
HOSP_CSV = os.path.join(PATHS.INTERNAL_GRAPHER_DIR, "COVID-2019 - Hospital & ICU.csv")
# Remote CSV (TrackingR repository on GitHub) used for the reproduction-rate country count.
REPR_CSV = "https://github.com/crondonm/TrackingR/raw/main/Estimates-Database/database_7.csv"
# Source file for the policy country count (read through its "CountryName" column).
POL_CSV = PATHS.INTERNAL_INPUT_BSG_FILE
# Codebook listing each column with its category and description.
CODEBOOK_CSV = PATHS.DATA_CODEBOOK_FILE
def get_excluded_locations():
    """Build the locations and ISO codes that must be left out of country counts.

    Reads the vaccinations file (``VACCINATIONS_CSV``) and marks as excluded
    every location whose ``iso_code`` contains ``"OWID_"``, plus the
    "2020 Summer Olympics athletes & staff" entry. Kosovo is always kept out
    of the exclusion set.

    Returns:
        tuple: ``(excluded_locations, excluded_isos)`` where the first item is
        a ``set`` of location names and the second is the array of unique
        ``iso_code`` values found for those locations.
    """
    df = pd.read_csv(VACCINATIONS_CSV)
    # Missing ISO codes are read as NaN (a float); drop them so the substring
    # test below cannot fail with a TypeError.
    owid_codes = [code for code in df["iso_code"].dropna().unique() if "OWID_" in code]
    excluded_locations = set(df.loc[df["iso_code"].isin(owid_codes), "location"].unique().tolist())
    excluded_locations.add("2020 Summer Olympics athletes & staff")
    # discard() rather than remove(): Kosovo must not be excluded, but its
    # absence from the set is not an error.
    excluded_locations.discard("Kosovo")
    excluded_isos = df.loc[df["location"].isin(excluded_locations), "iso_code"].unique()
    return excluded_locations, excluded_isos
# Computed once at import time (this reads VACCINATIONS_CSV); the counting helpers
# in this module use these to skip excluded locations / ISO codes.
EXCLUDE_LOCATIONS, EXCLUDE_ISOS = get_excluded_locations()
def get_num_countries_by_iso(iso_code_colname, csv_filepath=None, df=None):
    """Count the distinct, non-excluded ISO codes of a dataset.

    Args:
        iso_code_colname: Name of the column holding the ISO codes.
        csv_filepath: CSV to load when ``df`` is not given.
        df: Already-loaded DataFrame; takes precedence over ``csv_filepath``.

    Returns:
        int: Number of unique non-null codes that are not in ``EXCLUDE_ISOS``.
    """
    data = pd.read_csv(csv_filepath, low_memory=False) if df is None else df
    unique_codes = data[iso_code_colname].dropna().unique()
    return sum(1 for iso in unique_codes if iso not in EXCLUDE_ISOS)
def get_num_countries_by_location(csv_filepath, location_colname, low_memory=True):
    """Count the distinct, non-excluded location names of a CSV file.

    Args:
        csv_filepath: Path or URL of the CSV to read.
        location_colname: Name of the column holding the location names.
        low_memory: Forwarded to ``pandas.read_csv``.

    Returns:
        int: Number of unique non-null names that are not in ``EXCLUDE_LOCATIONS``.
    """
    names = pd.read_csv(csv_filepath, low_memory=low_memory)[location_colname]
    return sum(1 for name in names.dropna().unique() if name not in EXCLUDE_LOCATIONS)
def get_num_countries_jhu(csv_filepath):
    """Count the non-excluded location columns of a JHU-style wide CSV.

    Args:
        csv_filepath: Path of a CSV whose column headers are location names.

    Returns:
        int: Number of columns not listed in ``EXCLUDE_LOCATIONS``, minus one.
    """
    # Only the header row is needed, so skip parsing the data rows entirely.
    columns = pd.read_csv(csv_filepath, nrows=0).columns
    # The "- 1" discounts one non-location column (presumably the date column;
    # verify against the file layout).
    return len(columns[~columns.isin(EXCLUDE_LOCATIONS)]) - 1
def load_macro_df():
    """Load every macro-variable CSV and stack them into a single DataFrame.

    Each file under ``INPUT_DIR`` is read keeping only ``iso_code`` and the
    variable's own column; the per-variable frames are then concatenated.

    Returns:
        pandas.DataFrame: The concatenation of all per-variable frames.
    """
    # Maps each variable (also its column name) to its file, relative to INPUT_DIR.
    macro_variables = {
        "population": "un/population_latest.csv",
        "population_density": "wb/population_density.csv",
        "median_age": "un/median_age.csv",
        "aged_65_older": "wb/aged_65_older.csv",
        "aged_70_older": "un/aged_70_older.csv",
        "gdp_per_capita": "wb/gdp_per_capita.csv",
        "extreme_poverty": "wb/extreme_poverty.csv",
        "cardiovasc_death_rate": "gbd/cardiovasc_death_rate.csv",
        "diabetes_prevalence": "wb/diabetes_prevalence.csv",
        "female_smokers": "wb/female_smokers.csv",
        "male_smokers": "wb/male_smokers.csv",
        "handwashing_facilities": "un/handwashing_facilities.csv",
        "hospital_beds_per_thousand": "owid/hospital_beds.csv",
        "life_expectancy": "owid/life_expectancy.csv",
        "human_development_index": "un/human_development_index.csv",
    }
    frames = [
        pd.read_csv(os.path.join(INPUT_DIR, relative_path), usecols=["iso_code", variable])
        for variable, relative_path in macro_variables.items()
    ]
    return pd.concat(frames)
def get_variable_section():
    """Render one markdown section per codebook category.

    Every section holds a title, the category's top note, a markdown table of
    variables with their descriptions, and the category's notes. Categories
    are sorted, with "Others" always placed last.

    Returns:
        list[str]: The rendered sections, in category order.
    """
    template = """### {title}\n{notes_top}\n{table}\n{notes}"""
    codebook = pd.read_csv(CODEBOOK_CSV).rename(columns={"description": "Description"})
    notes_df = pd.read_csv(PATHS.INTERNAL_INPUT_OWID_COVID_NOTES_FILE, index_col="category")
    # Wrap each column name in backticks so it renders as inline code.
    codebook = codebook.assign(Variable=codebook["column"].map(lambda name: f"`{name}`"))

    ordered = [name for name in sorted(codebook["category"].unique()) if name != "Others"]
    ordered.append("Others")

    sections = []
    for category in ordered:
        subset = codebook[codebook["category"] == category]
        table = subset[["Variable", "Description"]].to_markdown(index=False)
        notes = _generate_category_notes(notes_df, category)
        notes_top = _generate_category_notes_top(notes_df, category)
        sections.append(template.format(title=category, table=table, notes=notes, notes_top=notes_top))
    return sections
[docs]def _generate_category_notes(df_notes, category):
notes_pretty = "\n#### Notes:\n"
if category in df_notes.index:
notes = df_notes.loc[category, "notes"]
if not pd.isnull(notes):
if isinstance(notes, list):
for note in notes:
notes_pretty += f"* {note}\n"
else:
notes_pretty += f"* {notes}\n"
else:
notes_pretty = ""
return notes_pretty
return ""
[docs]def _generate_category_notes_top(df_notes, category):
if category in df_notes.index:
note_top = df_notes.loc[category, "notes_top"]
if not pd.isnull(note_top):
return note_top
return ""
def get_placeholder():
    """Compute the values substituted into the README template.

    Returns:
        dict: Placeholder name -> value. Holds the per-dataset country counts
        and the rendered variable description text.
    """
    placeholders = {}
    placeholders["num_countries_vaccinations"] = get_num_countries_by_iso(
        iso_code_colname="iso_code", csv_filepath=VACCINATIONS_CSV
    )
    placeholders["num_countries_testing"] = get_num_countries_by_iso(
        iso_code_colname="ISO code", csv_filepath=TESTING_CSV
    )
    placeholders["num_countries_cases"] = get_num_countries_jhu(csv_filepath=CASES_CSV)
    placeholders["num_countries_deaths"] = get_num_countries_jhu(csv_filepath=DEATHS_CSV)
    placeholders["num_countries_hospital"] = get_num_countries_by_location(
        csv_filepath=HOSP_CSV, location_colname="Country"
    )
    placeholders["num_countries_reproduction"] = get_num_countries_by_location(
        csv_filepath=REPR_CSV, location_colname="Country/Region"
    )
    placeholders["num_countries_policy"] = get_num_countries_by_location(
        csv_filepath=POL_CSV, location_colname="CountryName", low_memory=False
    )
    macro_df = load_macro_df()
    placeholders["num_countries_others"] = get_num_countries_by_iso(
        iso_code_colname="iso_code", df=macro_df
    )
    sections = get_variable_section()
    placeholders["variable_description"] = "\n".join(sections)
    return placeholders
def generate_readme(readme_template: str, readme_output: str):
    """Fill the README template with fresh placeholder values and write it out.

    Args:
        readme_template: Path of the UTF-8 template file containing
            ``str.format``-style placeholders.
        readme_output: Path where the rendered README is written (UTF-8).
    """
    values = get_placeholder()
    with open(readme_template, "r", encoding="utf-8") as template_file:
        template_text = template_file.read()
    rendered = template_text.format(**values)
    with open(readme_output, "w", encoding="utf-8") as output_file:
        output_file.write(rendered)