Source code for cowidev.megafile.export.readme

import os
import pandas as pd

from cowidev import PATHS


INPUT_DIR = PATHS.INTERNAL_INPUT_DIR
VACCINATIONS_CSV = PATHS.DATA_VAX_MAIN_FILE
TESTING_CSV = PATHS.DATA_TEST_MAIN_FILE
CASES_CSV = PATHS.DATA_JHU_CASES_FILE
DEATHS_CSV = PATHS.DATA_JHU_DEATHS_FILE
HOSP_CSV = os.path.join(PATHS.INTERNAL_GRAPHER_DIR, "COVID-2019 - Hospital & ICU.csv")
REPR_CSV = "https://github.com/crondonm/TrackingR/raw/main/Estimates-Database/database_7.csv"
POL_CSV = PATHS.INTERNAL_INPUT_BSG_FILE
CODEBOOK_CSV = PATHS.DATA_CODEBOOK_FILE


[docs]def get_excluded_locations(): df = pd.read_csv(VACCINATIONS_CSV) codes = [code for code in df["iso_code"].unique() if "OWID_" in code] EXCLUDE_LOCATIONS = set( df[df.iso_code.isin(codes)].location.unique().tolist() + ["2020 Summer Olympics athletes & staff"] ) EXCLUDE_LOCATIONS.remove("Kosovo") EXCLUDE_ISOS = df[df.location.isin(EXCLUDE_LOCATIONS)].iso_code.unique() return EXCLUDE_LOCATIONS, EXCLUDE_ISOS
EXCLUDE_LOCATIONS, EXCLUDE_ISOS = get_excluded_locations()
[docs]def get_num_countries_by_iso(iso_code_colname, csv_filepath=None, df=None): if df is None: df = pd.read_csv(csv_filepath, low_memory=False) codes = [code for code in df[iso_code_colname].dropna().unique() if code not in EXCLUDE_ISOS] return len(codes)
[docs]def get_num_countries_by_location(csv_filepath, location_colname, low_memory=True): df = pd.read_csv(csv_filepath, low_memory=low_memory) locations = [loc for loc in df[location_colname].dropna().unique() if loc not in EXCLUDE_LOCATIONS] return len(locations)
[docs]def get_num_countries_jhu(csv_filepath): df = pd.read_csv(csv_filepath, low_memory=False) columns = df.columns return len(columns[~columns.isin(EXCLUDE_LOCATIONS)]) - 1
[docs]def load_macro_df(): macro_variables = { "population": "un/population_latest.csv", "population_density": "wb/population_density.csv", "median_age": "un/median_age.csv", "aged_65_older": "wb/aged_65_older.csv", "aged_70_older": "un/aged_70_older.csv", "gdp_per_capita": "wb/gdp_per_capita.csv", "extreme_poverty": "wb/extreme_poverty.csv", "cardiovasc_death_rate": "gbd/cardiovasc_death_rate.csv", "diabetes_prevalence": "wb/diabetes_prevalence.csv", "female_smokers": "wb/female_smokers.csv", "male_smokers": "wb/male_smokers.csv", "handwashing_facilities": "un/handwashing_facilities.csv", "hospital_beds_per_thousand": "owid/hospital_beds.csv", "life_expectancy": "owid/life_expectancy.csv", "human_development_index": "un/human_development_index.csv", } dfs = [] for var, file in macro_variables.items(): dfs.append(pd.read_csv(os.path.join(INPUT_DIR, file), usecols=["iso_code", var])) df = pd.concat(dfs) return df
[docs]def get_variable_section(): template = """### {title}\n{notes_top}\n{table}\n{notes}""" df = pd.read_csv(CODEBOOK_CSV).rename(columns={"description": "Description"}) df_notes = pd.read_csv(PATHS.INTERNAL_INPUT_OWID_COVID_NOTES_FILE, index_col="category") df = df.assign(Variable=df.column.apply(lambda x: f"`{x}`")) variable_description = [] categories = list(filter(lambda x: x != "Others", sorted(df.category.unique()))) + ["Others"] for cat in categories: df_ = df[df.category == cat] table = df_[["Variable", "Description"]].to_markdown(index=False) notes = _generate_category_notes(df_notes, cat) notes_top = _generate_category_notes_top(df_notes, cat) variable_description.append(template.format(title=cat, table=table, notes=notes, notes_top=notes_top)) return variable_description
[docs]def _generate_category_notes(df_notes, category): notes_pretty = "\n#### Notes:\n" if category in df_notes.index: notes = df_notes.loc[category, "notes"] if not pd.isnull(notes): if isinstance(notes, list): for note in notes: notes_pretty += f"* {note}\n" else: notes_pretty += f"* {notes}\n" else: notes_pretty = "" return notes_pretty return ""
[docs]def _generate_category_notes_top(df_notes, category): if category in df_notes.index: note_top = df_notes.loc[category, "notes_top"] if not pd.isnull(note_top): return note_top return ""
[docs]def get_placeholder(): placeholders = { "num_countries_vaccinations": get_num_countries_by_iso( csv_filepath=VACCINATIONS_CSV, iso_code_colname="iso_code" ), "num_countries_testing": get_num_countries_by_iso(csv_filepath=TESTING_CSV, iso_code_colname="ISO code"), "num_countries_cases": get_num_countries_jhu(csv_filepath=CASES_CSV), "num_countries_deaths": get_num_countries_jhu(csv_filepath=DEATHS_CSV), "num_countries_hospital": get_num_countries_by_location(csv_filepath=HOSP_CSV, location_colname="Country"), "num_countries_reproduction": get_num_countries_by_location( csv_filepath=REPR_CSV, location_colname="Country/Region" ), "num_countries_policy": get_num_countries_by_location( csv_filepath=POL_CSV, location_colname="CountryName", low_memory=False, ), "num_countries_others": get_num_countries_by_iso(df=load_macro_df(), iso_code_colname="iso_code"), "variable_description": "\n".join(get_variable_section()), } return placeholders
[docs]def generate_readme(readme_template: str, readme_output: str): placeholders = get_placeholder() with open(readme_template, "r", encoding="utf-8") as fr: s = fr.read() s = s.format(**placeholders) with open(readme_output, "w", encoding="utf-8") as fw: fw.write(s)