cowidev.utils.clean#

cowidev.utils.clean.dataframes#

cowidev.utils.clean.dataframes.clean_column_name(colname: str)[source]#

Clean column name.

cowidev.utils.clean.dataframes.clean_df_columns_multiindex(df: DataFrame)[source]#

cowidev.utils.clean.dates#

cowidev.utils.clean.dates._replace_date_fields(date_raw: str, replace_fields: dict = {}, date_format: str = '%Y-%m-%d')[source]#

Replace date field.

Parameters:
  • date_raw (str) – Raw date string, formatted according to date_format (%Y-%m-%d by default).

  • replace_fields (dict, optional) – Fields to replace. Format should be: dict(field, value), e.g. {year: “2021”}.

  • date_format (str, optional) – Date format of date_raw. Defaults to DATE_FORMAT.

Returns:

Modified date, in standard format %Y-%m-%d.

Return type:

str

cowidev.utils.clean.dates._setlocale(name: str)[source]#
cowidev.utils.clean.dates.clean_date(date_or_text: Union[str, datetime, date], fmt: Optional[str] = None, lang: str = 'en', loc: str = '', minus_days: int = 0, unicode_norm: bool = True, output_fmt: str = '%Y-%m-%d', as_datetime: bool = False)[source]#

Extract a date from a text.

The date from text is extracted using locale loc. Alternatively, you can provide language lang instead.

By default, system default locale is used.

Parameters:
  • date_or_text (Union[str, datetime, date]) – Input text or date.

  • fmt (str, optional) – Text format. More details at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes.

  • lang (str, optional) – Language two-letter code, e.g. ‘da’ (dansk). If given, loc will be ignored and redefined based on lang. Defaults to ‘en’.

  • loc (str, optional) – Locale, e.g es_ES. Get list of available locales with locale.locale_alias or locale.windows_locale in windows. Defaults to “” (system default).

  • minus_days (int, optional) – Number of days to subtract. Defaults to 0.

  • unicode_norm (bool, optional) – Whether to Unicode-normalize the input text before parsing it. Defaults to True.

  • output_fmt (str, optional) – Format of the output date. By default, uses DATE_FORMAT.

  • as_datetime (bool, optional) – Set to True to return the date as a datetime.

Returns:

Extracted date, formatted according to output_fmt (%Y-%m-%d by default), or a datetime object if as_datetime is True.

Return type:

str

cowidev.utils.clean.dates.clean_date_series(ds: Union[Series, list], format_input: Optional[str] = None, format_output: str = '%Y-%m-%d', as_datetime: bool = False, **kwargs) Union[Series, list][source]#
cowidev.utils.clean.dates.extract_clean_date(text: str, regex: str, date_format: str, lang: str = 'en', loc: str = '', minus_days: int = 0, unicode_norm: bool = True, replace_year=None)[source]#

Extract a clean date from raw text using a RegEx.

>>> from cowidev.utils import extract_clean_date
>>> text = "Something irrelevant. This page was last updated on 25 May 2021 at 09:05hrs."
>>> date_str = extract_clean_date(
    text=text,
    regex=r"This page was last updated on (\d{1,2} May 202\d) at \d{1,2}:\d{1,2}hrs",
    date_format="%d %B %Y",
    minus_days=1,
)
Parameters:
  • text (str) – Raw original text.

  • regex (str) – RegEx to extract the date fragment. The date should be captured in group number 1.

  • date_format (str) – Format of the date fragment extracted by regex.

  • lang (str, optional) – Language two-letter code, e.g. ‘da’ (dansk). If given, loc will be ignored and redefined based on lang. Defaults to ‘en’.

  • loc (str, optional) – Locale, e.g es_ES. Get list of available locales with locale.locale_alias or locale.windows_locale in windows. Defaults to “” (system default).

  • minus_days (int, optional) – Number of days to subtract. Defaults to 0.

  • unicode_norm (bool, optional) – Whether to Unicode-normalize the input text before parsing it. Defaults to True.

  • replace_year (str, optional) – If given, replace the year of the extracted date with this one. Defaults to None.

cowidev.utils.clean.dates.from_tz_to_tz(dt: datetime, from_tz: str = 'UTC', to_tz: Optional[str] = None)[source]#
cowidev.utils.clean.dates.list_timezones()[source]#
cowidev.utils.clean.dates.localdate(tz: str = 'utc', force_today: bool = False, hour_limit: Optional[int] = None, date_format: str = '%Y-%m-%d', plus_days: Optional[int] = None, as_datetime: bool = False, minus_days: int = 0)[source]#

Get local date.

By default, gets date prior to execution.

Parameters:
  • tz (str, optional) – Timezone name. Defaults to UTC.

  • force_today (bool, optional) – If True, return today’s date regardless of hour_limit value.

  • hour_limit (int, optional) – If local time hour is lower than this, returned date is previous day. Defaults to None.

  • date_format (str, optional) – Format of the output date. Defaults to %Y-%m-%d.

  • plus_days (int, optional) – Number of days to add to local date.

  • as_datetime (bool, optional) – Set to True to return the date as a datetime.

  • minus_days (int, optional) – Number of days to subtract. Defaults to 0.

cowidev.utils.clean.dates.localdatenow(tz: str = 'utc', **kwargs)[source]#
cowidev.utils.clean.dates.week_to_date(year: int, week: int, output_fmt: str = '%Y-%m-%d')[source]#

cowidev.utils.clean.numbers#

class cowidev.utils.clean.numbers.NumericCleaner[source]#

Bases: object

_build_number(numbers)[source]#
_is_not_verbose_and_incorrect(num)[source]#
_is_verbose(num)[source]#
_match_numeric_words(num_as_str)[source]#
_to_str(num_as_str)[source]#
clean_verbose_number(num_as_str)[source]#
numeric_words: dict = {'hundred': {'factor': 100.0, 'words': ['hundred', 'cien', 'cent', 'hundert', 'honderd', 'cem', 'cento']}, 'million': {'factor': 1000000.0, 'words': ['million', 'millió', 'millón', 'millones', 'millions', 'millionen', 'milioni', 'milione', 'miljoen', 'milhão', 'milhões']}, 'one': {'factor': 1, 'words': ['']}, 'ten_thousand': {'factor': 10000.0, 'words': ['万']}, 'thousand': {'factor': 1000.0, 'words': ['thousand', 'ezren', 'mil', 'duizend', 'mila', 'mille', 'tausend']}}#
regex_number_not_verbose: str = '\\d+((.\\d+)+)?'#
regex_number_not_verbose_correct: str = '\\d+((.\\d{3})+)?'#
property regex_number_verbose#
regex_number_verbose_template: str = '(?:(?P<{}>\\d+(?:\\.\\d+)?)\\s?(?:{}))?'#
run(num_as_str)[source]#
cowidev.utils.clean.numbers._series_to_float(ds)[source]#
cowidev.utils.clean.numbers._series_to_int64(ds)[source]#
cowidev.utils.clean.numbers.clean_count(count)[source]#
cowidev.utils.clean.numbers.metrics_to_num_float(df, metrics)[source]#
cowidev.utils.clean.numbers.metrics_to_num_int(df, metrics)[source]#

cowidev.utils.clean.strings#

cowidev.utils.clean.strings.clean_string(text_raw: str)[source]#

Clean string.

cowidev.utils.clean.urls#

cowidev.utils.clean.urls.clean_urls(df: DataFrame) DataFrame[source]#
cowidev.utils.clean.clean_column_name(colname: str)[source]#

Clean column name.

cowidev.utils.clean.clean_count(count)[source]#
cowidev.utils.clean.clean_date(date_or_text: Union[str, datetime, date], fmt: Optional[str] = None, lang: str = 'en', loc: str = '', minus_days: int = 0, unicode_norm: bool = True, output_fmt: str = '%Y-%m-%d', as_datetime: bool = False)[source]#

Extract a date from a text.

The date from text is extracted using locale loc. Alternatively, you can provide language lang instead.

By default, system default locale is used.

Parameters:
  • date_or_text (Union[str, datetime, date]) – Input text or date.

  • fmt (str, optional) – Text format. More details at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes.

  • lang (str, optional) – Language two-letter code, e.g. ‘da’ (dansk). If given, loc will be ignored and redefined based on lang. Defaults to ‘en’.

  • loc (str, optional) – Locale, e.g es_ES. Get list of available locales with locale.locale_alias or locale.windows_locale in windows. Defaults to “” (system default).

  • minus_days (int, optional) – Number of days to subtract. Defaults to 0.

  • unicode_norm (bool, optional) – Whether to Unicode-normalize the input text before parsing it. Defaults to True.

  • output_fmt (str, optional) – Format of the output date. By default, uses DATE_FORMAT.

  • as_datetime (bool, optional) – Set to True to return the date as a datetime.

Returns:

Extracted date, formatted according to output_fmt (%Y-%m-%d by default), or a datetime object if as_datetime is True.

Return type:

str

cowidev.utils.clean.clean_date_series(ds: Union[Series, list], format_input: Optional[str] = None, format_output: str = '%Y-%m-%d', as_datetime: bool = False, **kwargs) Union[Series, list][source]#
cowidev.utils.clean.clean_df_columns_multiindex(df: DataFrame)[source]#
cowidev.utils.clean.clean_string(text_raw: str)[source]#

Clean string.

cowidev.utils.clean.clean_urls(df: DataFrame) DataFrame[source]#
cowidev.utils.clean.extract_clean_date(text: str, regex: str, date_format: str, lang: str = 'en', loc: str = '', minus_days: int = 0, unicode_norm: bool = True, replace_year=None)[source]#

Extract a clean date from raw text using a RegEx.

>>> from cowidev.utils import extract_clean_date
>>> text = "Something irrelevant. This page was last updated on 25 May 2021 at 09:05hrs."
>>> date_str = extract_clean_date(
    text=text,
    regex=r"This page was last updated on (\d{1,2} May 202\d) at \d{1,2}:\d{1,2}hrs",
    date_format="%d %B %Y",
    minus_days=1,
)
Parameters:
  • text (str) – Raw original text.

  • regex (str) – RegEx to extract the date fragment. The date should be captured in group number 1.

  • date_format (str) – Format of the date fragment extracted by regex.

  • lang (str, optional) – Language two-letter code, e.g. ‘da’ (dansk). If given, loc will be ignored and redefined based on lang. Defaults to ‘en’.

  • loc (str, optional) – Locale, e.g es_ES. Get list of available locales with locale.locale_alias or locale.windows_locale in windows. Defaults to “” (system default).

  • minus_days (int, optional) – Number of days to subtract. Defaults to 0.

  • unicode_norm (bool, optional) – Whether to Unicode-normalize the input text before parsing it. Defaults to True.

  • replace_year (str, optional) – If given, replace the year of the extracted date with this one. Defaults to None.