import pandas as pd
import requests
from bs4 import BeautifulSoup
from cowidev import PATHS
from cowidev.utils.web.scraping import get_soup, get_headers
from cowidev.vax.utils.orgs import WHO_VACCINES
# Replacement names applied by `map_vaccine`: keys are vaccine names as scraped from the website, values are
# the names they are replaced with. Names missing from this table are left unchanged by `map_vaccine`.
VAX_MAPPING = {
    "Bharat Biotech": "Covaxin",
    "FBRI": "EpiVacCorona",
    "Janssen (Johnson & Johnson)": "Johnson&Johnson",
    "Moderna": "Moderna",
    "Oxford/AstraZeneca": "Oxford/AstraZeneca",
    "Pfizer/BioNTech": "Pfizer/BioNTech",
    "Sinopharm (Beijing)": "Sinopharm/Beijing",
    "Sinopharm (Wuhan)": "Sinopharm/Wuhan",
    "Sinovac": "Sinovac",
    "Gamaleya": "Sputnik V",
    # Two different web names intentionally collapse into the same replacement name.
    "Serum Institute of India": "Oxford/AstraZeneca",
}
# URL slugs for locations whose country page cannot be derived by lower-casing the name and replacing
# spaces with hyphens (the fallback used by `TrackVaccinesClient.get_country_url`).
# Slugs carry NO leading or trailing slash: the URL builder adds the separators itself, so a slash here
# would yield a double slash in the final URL.
COUNTRY_MAP = {
    "United Kingdom": "united-kingdom-of-great-britain-and-northern-ireland",
    "United States": "united-states-of-america",
    "Venezuela": "venezuela-bolivarian-republic-of",
    "South Korea": "republic-of-korea",
    # The UK nations all point at the United Kingdom page.
    "Scotland": "united-kingdom-of-great-britain-and-northern-ireland",
    "Wales": "united-kingdom-of-great-britain-and-northern-ireland",
    "England": "united-kingdom-of-great-britain-and-northern-ireland",
    "Northern Ireland": "united-kingdom-of-great-britain-and-northern-ireland",
    "Faeroe Islands": "faroe-islands",
    "Cape Verde": "cabo-verde",
    "Brunei": "brunei-darussalam",
    "Bahamas": "the-bahamas",
}
class TrackVaccinesClient:
    """Client to interact with https://covid19.trackvaccines.org.

    Args:
        timeout (float, optional): Seconds to wait for the server when checking that a country page exists.
            Defaults to 30.
    """

    DEFAULT_TIMEOUT = 30.0

    def __init__(self, timeout: float = DEFAULT_TIMEOUT):
        self.base_url = "https://covid19.trackvaccines.org"
        self.timeout = timeout

    def get_country_url(self, location: str) -> str:
        """Build the URL of the page of a country and check that it can be fetched.

        Args:
            location (str): Country name. Names listed in `COUNTRY_MAP` use the slug given there; any other
                name is lower-cased and its spaces are replaced with hyphens.

        Raises:
            ValueError: If the request to the built URL is not successful.

        Returns:
            str: URL of the country page.
        """
        slug = COUNTRY_MAP.get(location)
        if slug is None:
            slug = location.lower().replace(" ", "-")
        # The separators are added below; drop stray ones so the URL never contains a double slash.
        slug = slug.strip("/")
        url = f"{self.base_url}/country/{slug}/"
        if not self._valid_url(url):
            raise ValueError(f"Couldn't find vaccines for {slug}. Check {url}")
        return url

    @property
    def all_vaccines_url(self) -> str:
        """URL of the page listing all vaccines."""
        return f"{self.base_url}/vaccines/"

    def _valid_url(self, url: str) -> bool:
        """Tell whether a GET request to `url` gets a successful response.

        The request is bounded by `self.timeout`; without a timeout, `requests` waits indefinitely for an
        unresponsive server. Exceptions raised by `requests` (including timeouts) are not caught here.
        """
        resp = requests.get(url, headers=get_headers(), timeout=self.timeout)
        return resp.ok

    def vaccines_approved(self, location: str = None, original_names: bool = False) -> list:
        """Get list of approved vaccines in a country (or all if None specified).

        Args:
            location (str, optional): Country name. If None, retrieves all approved vaccines. Defaults to None.
            original_names (bool, optional): Set to True to keep vaccine from web. Defaults to False.

        Returns:
            list: Approved vaccines. None if a ValueError is raised while retrieving the vaccines of
                `location` (e.g. its page could not be found).
        """
        if not location:
            soup = get_soup(self.all_vaccines_url)
            return self._parse_vaccines_all(soup, original_names)
        try:
            url = self.get_country_url(location)
            soup = get_soup(url)
            return self._parse_vaccines_location(soup, original_names)
        except ValueError:
            # Callers iterate over many locations; a missing page is reported as None instead of raising.
            return None

    def _parse_vaccines_location(self, soup: "BeautifulSoup", original_names: bool = False) -> list:
        """Extract the vaccine names from the parsed page of a country."""
        content = soup.find(class_="card-grid alignwide")
        vaccines_html = content.find_all(class_="card__title has-text-align-center")
        vaccines = [e.find("span").text for e in vaccines_html]
        return self._format_vaccines(vaccines, original_names)

    def _parse_vaccines_all(self, soup: "BeautifulSoup", original_names: bool = False) -> list:
        """Extract the vaccine names from the parsed page listing all vaccines."""
        vaccines_html = soup.find("ul", class_="card-grid alignwide").find_all("h2")
        vaccines = [vax.find("span").text for vax in vaccines_html]
        return self._format_vaccines(vaccines, original_names)

    @staticmethod
    def _format_vaccines(vaccines: list, original_names: bool) -> list:
        """Return scraped names untouched, or mapped with `map_vaccine`, de-duplicated and sorted."""
        if original_names:
            return vaccines
        # Sorting makes the output reproducible; the order of a set of strings can change between runs.
        return sorted(set(map(map_vaccine, vaccines)))
def map_vaccine(vaccine):
    """Return the replacement name for `vaccine` from `VAX_MAPPING`, or `vaccine` itself if it has none."""
    return VAX_MAPPING.get(vaccine, vaccine)
def _split_vaccines(value) -> set:
    """Turn a comma-separated string of vaccine names into a set of names.

    Names are stripped of surrounding whitespace and empty names are dropped. Anything that is not a string
    (e.g. the NaN that pandas uses for an empty cell) gives an empty set.
    """
    if not isinstance(value, str):
        return set()
    return {name.strip() for name in value.split(",")} - {""}


def vaccines_tracked(path_locations: str = None, location: str = None, as_list: bool = False) -> pd.DataFrame:
    """Get tracked vaccines for tracked countries.

    Args:
        path_locations (str, optional): Path to locations csv file. It must have columns "location" and
            "vaccines" (comma-separated vaccine names).
            Default value works if repo structure is left unmodified.
        location (str, optional): Country name, or list of country names, to keep. Defaults to None (all).
        as_list (bool, optional): Set to True to return a (flattened) list.

    Returns:
        pd.DataFrame: Dataframe with location and vaccines tracked (as a set per location). If `as_list` is
            True, a sorted list with the unique vaccines of the selected locations is returned instead.
    """
    if not path_locations:
        path_locations = PATHS.DATA_VAX_META_FILE
    df = pd.read_csv(path_locations, usecols=["vaccines", "location"])
    df = df.assign(vaccines=df.vaccines.apply(_split_vaccines))
    if location:
        if isinstance(location, str):
            location = [location]
        df = df[df.location.isin(location)]
    if as_list:
        # Sorted so that the result does not depend on set iteration order.
        return sorted(set().union(*df.vaccines))
    return df
def vaccines_approved(path_locations: str = None, verbose: bool = False) -> pd.DataFrame:
    """Get approved vaccines for tracked countries.

    One lookup is made through `TrackVaccinesClient` per location in the file, so this may take
    between 2-3 minutes.

    Args:
        path_locations (str, optional): Path to locations csv file.
            Default value works if repo structure is left unmodified.
        verbose (bool, optional): Set to True to print a notice about the expected duration.
            Defaults to False.

    Returns:
        pd.DataFrame: Dataframe with location and vaccines approved (a set, or None when the client
            returned None for that location).
    """

    def _as_set(names):
        # Keep None as the marker for "no result"; everything else becomes a set.
        return None if names is None else set(names)

    if verbose:
        print("This may take from 2 to 3 minutes...")
    csv_path = path_locations or PATHS.DATA_VAX_META_FILE
    locations = pd.read_csv(csv_path, usecols=["location"])
    client = TrackVaccinesClient()
    approved = locations.location.apply(client.vaccines_approved)
    return locations.assign(vaccines=approved.apply(_as_set))
def vaccines_missing(aggregated: bool = False, verbose: bool = False):
    """Get missing vaccines.

    - "unapproved" means vaccines tracked (currently being administered) but not listed as approved.
    - "untracked" means vaccines listed as approved but not tracked.

    Note: Unapproved might mean that trackvaccines.org are not counting a vaccine that was actually approved.

    Args:
        aggregated (bool, optional): Set to True to get list of untracked/unapproved global vaccines. Defaults to
            False.
        verbose (bool, optional): Forwarded to `vaccines_approved` (per-country mode only). Defaults to False.

    Returns:
        Union[pd.DataFrame, dict]: Unapproved/untracked vaccines. If `aggregated` is True, a dictionary with
            keys "vaccines_untracked" and "vaccines_unapproved" (sorted lists). Otherwise, a dataframe with one
            row per location whose tracked and approved vaccines differ, with columns "location", "unapproved",
            "num_unapproved", "untracked" and "num_untracked", sorted by "num_untracked" (descending).
    """
    if aggregated:
        tracked = set(vaccines_tracked(as_list=True))
        approved = set(TrackVaccinesClient().vaccines_approved())
        return {
            "vaccines_untracked": sorted(approved - tracked),
            "vaccines_unapproved": sorted(tracked - approved),
        }

    vax_tracked = vaccines_tracked()
    vax_approved = vaccines_approved(verbose=verbose)
    # Build result dataframe
    df = vax_tracked.merge(vax_approved, on="location", suffixes=("_tracked", "_approved"))
    # Keep locations where both sets differ; dropna removes locations without approved vaccines (None).
    df = df[df.vaccines_tracked != df.vaccines_approved].dropna()
    # Plain comprehensions are used instead of DataFrame.apply(axis=1): on a frame without rows, apply
    # gives back a frame rather than a Series, which cannot be assigned to a single column.
    pairs = list(zip(df.vaccines_tracked, df.vaccines_approved))
    unapproved = [sorted(tracked - approved) for tracked, approved in pairs]
    untracked = [sorted(approved - tracked) for tracked, approved in pairs]
    df = df.assign(
        unapproved=unapproved,
        untracked=untracked,
        num_unapproved=[len(names) for names in unapproved],
        num_untracked=[len(names) for names in untracked],
    )
    df = df[["location", "unapproved", "num_unapproved", "untracked", "num_untracked"]]
    return df.sort_values(by="num_untracked", ascending=False)
def vaccines_comparison_with_who():
    """Compare the vaccines used in each country according to WHO and according to OWID.

    Both datasets are downloaded from their public URLs and joined on the country ISO code. WHO vaccine
    names are translated with `WHO_VACCINES`, and only WHO records with a START_DATE are considered.

    Returns:
        pd.DataFrame: One row per ISO code present in both sources, with columns "location",
            "vaccines_used_owid", "vaccines_used_who", "missing_in_who" (in OWID but not in WHO) and
            "missing_in_owid" (in WHO but not in OWID). All vaccine columns hold sets.
    """

    def _who_vaccines(group):
        # Vaccines of one country, restricted to records that have a start date.
        started = group[group.START_DATE.notnull()]
        return set(WHO_VACCINES[name] for name in started.VACCINE_NAME)

    def _owid_vaccines(raw):
        return set(name.strip() for name in raw.split(", "))

    def _only_in_owid(row):
        return row.vaccines_used_owid.difference(row.vaccines_used_who)

    def _only_in_who(row):
        return row.vaccines_used_who.difference(row.vaccines_used_owid)

    # Load WHO
    who_url = "https://covid19.who.int/who-data/vaccination-metadata.csv"
    who = pd.read_csv(who_url)
    vaccines_used_who = who.groupby("ISO3").apply(_who_vaccines)
    vaccines_used_who.name = "vaccines_used_who"

    # Load OWID
    owid_url = "https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/locations.csv"
    owid = pd.read_csv(owid_url)
    owid = owid.assign(vaccines_used_owid=owid.vaccines.apply(_owid_vaccines))
    vaccines_used_owid = owid[["iso_code", "location", "vaccines_used_owid"]].set_index("iso_code")

    # Merge
    merged = vaccines_used_owid.merge(vaccines_used_who, right_index=True, left_index=True)

    # Obtain differences
    return merged.assign(
        missing_in_who=merged.apply(_only_in_owid, axis=1),
        missing_in_owid=merged.apply(_only_in_who, axis=1),
    )