Source code for cowidev.megafile.export.annotations

import yaml
import pandas as pd


[docs]class AnnotatorInternal: """Adds annotations column. Uses attribute `config` to add annotations. Its format should be as: .. code-block:: python { "vaccinations": [{ 'annotation_text': 'Data for China added on Jun 10', 'location': ['World', 'Asia', 'Upper middle income'], 'date': '2020-06-10' }], "case-tests": [{ 'annotation_text': 'something', 'location': ['World', 'Asia', 'Upper middle income'], 'date': '2020-06-11' }], } Keys in config should match those in `internal_files_columns`. """ def __init__(self, config: dict, logger=None): self._config = config self._logger = logger
[docs] @classmethod def from_yaml(cls, path, logger=None): with open(path, "r") as f: dix = yaml.safe_load(f) return cls(dix, logger)
@property def config(self): for stream in self._config.keys(): self._config[stream] = sorted(self._config[stream], key=lambda x: x["date"]) return self._config @property def streams(self): return list(self._config.keys())
[docs] def config_nested_to_flat(self, config): """Convert class attribute config to a flattened dataframe. Each row in the dataframe contains [stream, annotation_text, location, date]. Essentially, what gets flattened is the `location` field, which originally contains a list of locations. Args: config (dict): Dictionary with original class config. Returns: pd.DataFrame: Table with config in a flatten version. """ data_flat = [] for stream, config_ in config.items(): for d in config_: for loc in d["location"]: data_flat.append( { "stream": stream, "annotation_text": d["annotation_text"], "date": d["date"], "location": loc, } ) return pd.DataFrame(data_flat)
[docs] def config_flat_to_nested(self, df_config): """Converts flattened config dataframe to class instance format. Args: df_config (pd.DataFrame): Flattened config. Returns: dict: Dictionary with original data. """ config_nested = {} streams = df_config.stream.unique() for stream in streams: df_ = df_config[df_config.stream == stream] rec = df_.groupby(["annotation_text", "date"]).location.apply(list).reset_index().to_dict(orient="records") config_nested[stream] = rec return config_nested
[docs] def _remove_config_duplicates(self): df_config = self.config_nested_to_flat(self._config) df_config = df_config.drop_duplicates() return self.config_flat_to_nested(df_config)
[docs] def insert_annotation(self, stream: str, annotation: dict): # Checks if "annotation_text" not in annotation or "location" not in annotation or "date" not in annotation: raise ValueError("annotation dictionary must contain fields `annotation_text`, `location` and `date`") if not ( isinstance(annotation["annotation_text"], str) and isinstance(annotation["location"], list) and isinstance(annotation["annotation_text"], str) ): raise ValueError( f"Check `annotation` field types. `annotation_text` (str), `location` (list) and `date` (str)" ) # Add annotation self._config[stream].append(annotation) # Remove duplicates self._config = self._remove_config_duplicates()
[docs] def to_yaml(self): pass
[docs] def add_annotations(self, df: pd.DataFrame, stream: str) -> pd.DataFrame: if stream in self.streams: self._logger.info(f"Adding annotation for {stream}") return self._add_annotations(df, stream) return df
[docs] def _add_annotations(self, df: pd.DataFrame, stream: str) -> pd.DataFrame: df = df.assign(annotations=pd.NA) conf = self.config[stream] for c in conf: if not ("location" in c and "annotation_text" in c): raise ValueError(f"Missing field in {stream} (`location` and `annotation_text` are required).") if isinstance(c["location"], str): mask = df.location == c["location"] elif isinstance(c["location"], list): mask = df.location.isin(c["location"]) if "date" in c: mask = mask & (df.date >= c["date"]) df.loc[mask, "annotations"] = c["annotation_text"] return df
[docs]def add_annotations_countries_100_percentage(df, annotator): threshold_perc = 100 locations_exc = df[df.people_vaccinated_per_hundred > threshold_perc].groupby("location").date.min().to_dict() for loc, dt in locations_exc.items(): annotator.insert_annotation( "vaccinations", { "annotation_text": "Exceeds 100% due to vaccination of non-residents", "location": [loc], "date": dt, }, ) return annotator