Source code for cowidev.utils.clean.numbers

import re
from decimal import Decimal

import pandas as pd


class NumericCleaner:
    """Turn a human-written count into an ``int``.

    Two input shapes are handled by :meth:`run`:

    * plain digit strings, optionally split into groups of three digits by a
      single separator character (``"12345"``, ``"12,345"``, ``"12 345"``);
    * "verbose" numbers that pair digits with a magnitude word
      (``"1.5 million"``, ``"3万"``, ``"1 million 200 thousand"``).
    """

    # Magnitude name -> the words that denote it and the multiplier to apply.
    # The insertion order matters: it is the order in which the magnitudes are
    # expected to appear in the text (largest first). The names double as the
    # named groups of the verbose regular expression.
    numeric_words: dict = {
        "million": {
            "words": [
                "million",
                "millió",
                "millón",
                "millones",
                "millions",
                "millionen",
                "milioni",
                "milione",
                "miljoen",
                "milhão",
                "milhões",
            ],
            "factor": 1e6,
        },
        "ten_thousand": {
            "words": ["万"],
            "factor": 1e4,
        },
        "thousand": {
            "words": ["thousand", "ezren", "mil", "duizend", "mila", "mille", "tausend"],
            "factor": 1e3,
        },
        "hundred": {
            "words": ["hundred", "cien", "cent", "hundert", "honderd", "cem", "cento"],
            "factor": 1e2,
        },
        # The empty word lets a trailing bare number be captured as units.
        "one": {"words": [""], "factor": 1},
    }

    # One optional "<number> <word>" chunk; filled with (group name, word alternation).
    regex_number_verbose_template: str = r"(?:(?P<{}>\d+(?:\.\d+)?)\s?(?:{}))?"
    # Digits, optionally followed by "<any single character><digits>" groups.
    # The unescaped "." means any character is accepted as the separator.
    regex_number_not_verbose: str = r"\d+((.\d+)+)?"
    # Same as above, but every separated group must have exactly three digits.
    regex_number_not_verbose_correct: str = r"\d+((.\d{3})+)?"

    @property
    def regex_number_verbose(self):
        """Pattern matching the magnitude chunks in order, each one optional."""
        chunks = [
            self.regex_number_verbose_template.format(name, "|".join(spec["words"]))
            for name, spec in self.numeric_words.items()
        ]
        return r"\s?".join(chunks)

    def _match_numeric_words(self, num_as_str):
        """Return ``{magnitude name: matched number}`` for ``num_as_str``.

        Magnitudes that are absent map to ``0``; present ones map to the
        matched digit string. Every chunk of the pattern is optional, so the
        search always succeeds at the start of the string: text that does not
        begin with a number produces all zeros.
        """
        match = re.search(self.regex_number_verbose, num_as_str)
        return match.groupdict(default=0)

    def _build_number(self, numbers):
        """Combine the per-magnitude amounts into a single integer.

        Each amount is multiplied by its magnitude's factor and the products
        are summed; any remaining fractional part is truncated.
        """
        total = Decimal(0)
        for name, amount in numbers.items():
            factor = self.numeric_words[name]["factor"]
            # Decimal keeps products exact. With floats, 4.35 * 100 evaluates
            # to 434.99999999999994 and would truncate to 434.
            total += Decimal(str(amount)) * Decimal(str(factor))
        return int(total)

    def _to_str(self, num_as_str):
        """Return ``num_as_str`` unchanged if it is a ``str``, else ``str()`` of it."""
        if isinstance(num_as_str, str):
            return num_as_str
        return str(num_as_str)

    def _is_verbose(self, num):
        """Return True when ``num`` is not a plain (optionally separated) digit string."""
        return re.fullmatch(self.regex_number_not_verbose, num) is None

    def _is_not_verbose_and_incorrect(self, num):
        """Return True when ``num`` is not digits in separator-delimited groups of three."""
        return re.fullmatch(self.regex_number_not_verbose_correct, num) is None

    def clean_verbose_number(self, num_as_str):
        """Parse a number written with magnitude words, e.g. ``"1.5 million"``."""
        return self._build_number(self._match_numeric_words(num_as_str))

    def run(self, num_as_str):
        """Convert ``num_as_str`` to an ``int``.

        Args:
            num_as_str: The value to clean. Non-string values are converted
                with ``str()`` first; surrounding whitespace is ignored.

        Returns:
            The parsed integer.

        Raises:
            ValueError: If the value is a digit string whose separated groups
                are not all exactly three digits long (e.g. ``"1,23"``).
        """
        num = self._to_str(num_as_str).strip()
        if self._is_verbose(num):
            return self.clean_verbose_number(num)
        if self._is_not_verbose_and_incorrect(num):
            raise ValueError("The format of the number seems to be not correct! Please review.")
        # Drop the group separators and keep only the digits.
        return int(re.sub(r"[^0-9]", "", num))
def clean_count(count):
    """Parse ``count`` into an integer using a fresh ``NumericCleaner``.

    Args:
        count: The raw value to clean; it is passed straight to
            ``NumericCleaner.run``.

    Returns:
        Whatever ``NumericCleaner.run`` returns for ``count``.
    """
    return NumericCleaner().run(count)
def _series_to_int64(ds):
    """Return ``ds`` cast to the pandas ``Int64Dtype`` extension dtype."""
    nullable_int = pd.Int64Dtype()
    return ds.astype(nullable_int)
def _series_to_float(ds):
    """Return ``ds`` cast to the built-in ``float`` dtype."""
    target_dtype = float
    return ds.astype(target_dtype)
def metrics_to_num_int(df, metrics):
    """Cast the listed columns of ``df`` to the ``Int64`` extension dtype.

    Args:
        df: DataFrame to convert; it is modified in place.
        metrics: Iterable of column names. Names that are not columns of
            ``df`` are skipped.

    Returns:
        The same ``df`` object, for chaining.
    """
    for metric in metrics:
        if metric in df.columns:
            # Assign the column by name so it is replaced together with its
            # dtype. `df.loc[:, metric] = ...` writes into the existing array
            # on pandas >= 2.0, which keeps the old dtype and loses the cast.
            df[metric] = _series_to_int64(df[metric])
    return df
def metrics_to_num_float(df, metrics):
    """Cast the listed columns of ``df`` to ``float``.

    Args:
        df: DataFrame to convert; it is modified in place.
        metrics: Iterable of column names. Names that are not columns of
            ``df`` are skipped.

    Returns:
        The same ``df`` object, for chaining.
    """
    for metric in metrics:
        if metric in df.columns:
            # Assign the column by name so it is replaced together with its
            # dtype. `df.loc[:, metric] = ...` writes into the existing array
            # on pandas >= 2.0, which keeps the old dtype and loses the cast.
            df[metric] = _series_to_float(df[metric])
    return df