Source code for cowidev.testing.incremental.russia

import re

from bs4 import BeautifulSoup, element
import pandas as pd

from cowidev.utils.web import get_soup
from cowidev.utils.clean import clean_count, clean_date
from cowidev.testing.utils.incremental import increment


class Russia:
    """Incremental scraper of the test count reported in a news-feed bulletin.

    The news feed under ``_base_url`` is paginated; the scraper looks for the
    first anchor whose text matches ``regex["title"]``, opens the linked page
    and extracts the publication date and the number of tests from it.
    """

    location = "Russia"
    units = "tests performed"
    source_label = "Government of the Russian Federation"
    notes = ""
    _base_url = "https://rospotrebnadzor.ru"
    _url_subdirectory = "/about/info/news/?PAGEN_1="
    # Number of news-feed pages inspected before giving up.
    _num_max_pages = 3
    regex = {
        "title": r"Информационный бюллетень о ситуации",
        "date": r"(\d+ \d+ \d+)",
        "count": r"проведено (\d+).* исследовани",
    }

    def read(self) -> pd.Series:
        """Fetch the first bulletin found in the leading news-feed pages.

        Pages ``1`` to ``_num_max_pages`` are visited in order and the search
        stops at the first page that contains a matching bulletin link.

        Returns:
            A series with ``source_url``, ``date`` and ``count`` entries, or an
            empty series when none of the pages contained a bulletin.
        """
        record = None
        for page in range(1, self._num_max_pages + 1):
            url = f"{self._base_url}{self._url_subdirectory}{page}"
            soup = get_soup(url)
            record, proceed = self._parse_data(soup)
            if not proceed:
                break
        if record is None:
            # No page matched: return an explicitly empty series instead of
            # handing ``None`` to the Series constructor.
            return pd.Series(dtype="object")
        return pd.Series(record)

    def _parse_data(self, soup: "BeautifulSoup") -> tuple:
        """Get data from one news-feed page.

        Returns:
            ``(record, proceed)``. ``record`` is a dict with ``source_url``,
            ``date`` and ``count`` keys, or ``None`` when the page holds no
            bulletin link. ``proceed`` is ``True`` when the caller should move
            on to the next page.
        """
        # Get relevant element
        elem = self._get_relevant_element(soup)
        if elem is None:
            return None, True
        # Extract url from element
        url = self._get_link_from_element(elem)
        # Extract text from url
        text, date = self._get_text_and_date_from_url(url)
        count = self._parse_metrics(text)
        record = {
            "source_url": url,
            "date": date,
            "count": count,
        }
        return record, False

    def _get_relevant_element(self, soup: "BeautifulSoup") -> "element.Tag":
        """Get the first anchor in the news feed whose text matches the title pattern.

        Returns ``None`` when no anchor matches.
        """
        news_list = soup.find(class_="content").find(class_="page").find_all("a")
        pattern = self.regex["title"]
        return next((news for news in news_list if re.search(pattern, news.text)), None)

    def _get_link_from_element(self, elem: "element.Tag") -> str:
        """Build the absolute bulletin URL from the anchor's ``href`` attribute.

        An ``href`` that already carries an http(s) scheme is returned as is;
        anything else is treated as a path relative to ``_base_url``.
        """
        href = elem["href"]
        if href.startswith(("http://", "https://")):
            return href
        return f"{self._base_url}/{href.lstrip('/')}"

    def _get_text_and_date_from_url(self, url: str) -> tuple:
        """Download the bulletin page and return ``(text, date)``.

        The text has newlines turned into spaces, non-breaking spaces removed
        and whitespace between digits dropped, so that numbers written with
        thousands separators become contiguous digit runs.
        """
        soup = get_soup(url)
        date = self._parse_date(soup)
        text = soup.find(class_="news-detail").text.replace("\n", " ").replace("\xa0", "")
        text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
        return text, date

    def _parse_date(self, soup: "BeautifulSoup") -> str:
        """Get the date from the bulletin page.

        Raises:
            ValueError: If the text of the ``date`` element does not match
                ``regex["date"]``.
        """
        date_text = soup.find(class_="date").text.replace(".", " ")
        match = re.search(self.regex["date"], date_text)
        if match is None:
            raise ValueError(f"Could not find a date in {date_text!r}")
        return clean_date(match.group(), "%d %m %Y")

    def _parse_metrics(self, text: str) -> int:
        """Get the test count from the bulletin text.

        Raises:
            ValueError: If ``text`` does not match ``regex["count"]``.
        """
        match = re.search(self.regex["count"], text)
        if match is None:
            raise ValueError("Could not find the test count in the bulletin text")
        count = int(match.group(1))
        return clean_count(count)

    def export(self):
        """Read the latest bulletin and pass it to ``increment``.

        Raises:
            ValueError: If no bulletin was found in the inspected pages.
        """
        data = self.read()
        if data.empty:
            raise ValueError(
                f"No bulletin found in the first {self._num_max_pages} news pages of {self._base_url}"
            )
        increment(
            sheet_name=self.location,
            country=self.location,
            units=self.units,
            date=data["date"],
            source_url=data["source_url"],
            source_label=self.source_label,
            count=data["count"],
        )
def main():
    """Run the Russia scraper and export the record it reads."""
    scraper = Russia()
    scraper.export()