Source code for cowidev.utils.web.download

import os
import tempfile
from urllib.parse import urlparse

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context

from cowidev.utils.web.scraping import to_proxy_url


# OpenSSL cipher-list string passed to ``create_urllib3_context`` by
# ``DESAdapter`` (used when a download is requested with ``ciphers_low=True``).
# NOTE(review): per OpenSSL cipher-list syntax, ``!DH`` / ``!aNULL`` exclude
# those suites and ``@SECLEVEL=1`` lowers the security level, presumably to
# reach servers with legacy TLS configurations -- confirm before tightening.
CIPHERS = "HIGH:!DH:!aNULL:DEFAULT@SECLEVEL=1"


def read_xlsx_from_url(
    url: str,
    timeout=30,
    as_series: bool = False,
    verify=True,
    drop=False,
    ciphers_low=False,
    use_proxy=False,
    **kwargs,
) -> pd.DataFrame:
    """Download an Excel file from a URL and load it with pandas.

    Args:
        url (str): File url.
        timeout: Forwarded to ``download_file_from_url``. Defaults to 30.
        as_series (bool): Set to True to return a pandas.Series object. Source
            file must be of shape 1xN (1 row, N columns). Defaults to False.
        verify: Forwarded to ``download_file_from_url``. Defaults to True.
        drop (bool): Set to True to drop rows whose values are all missing.
            Not applied when ``as_series`` is True. Defaults to False.
        ciphers_low: Forwarded to ``download_file_from_url``. Defaults to False.
        use_proxy: Forwarded to ``download_file_from_url``. Defaults to False.
        kwargs: Arguments for pandas.read_excel.

    Returns:
        pandas.DataFrame: Data loaded. When ``as_series`` is True the result of
        ``df.T.squeeze()`` is returned instead.
    """
    # Create the temporary file and close our handle immediately: reopening a
    # NamedTemporaryFile by name while it is still open is not portable
    # (it fails on Windows), and both the downloader and pandas open it by name.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    try:
        download_file_from_url(
            url,
            tmp.name,
            timeout=timeout,
            verify=verify,
            ciphers_low=ciphers_low,
            use_proxy=use_proxy,
        )
        df = pd.read_excel(tmp.name, **kwargs)
    finally:
        # delete=False means nobody else will clean this up for us.
        os.remove(tmp.name)
    if as_series:
        return df.T.squeeze()
    if drop:
        df = df.dropna(how="all")
    return df
def read_csv_from_url(url, timeout=30, verify=True, ciphers_low=False, use_proxy=False, **kwargs):
    """Download a CSV file from a URL and load it with pandas.

    Args:
        url (str): File url.
        timeout: Forwarded to ``download_file_from_url``. Defaults to 30.
        verify: Forwarded to ``download_file_from_url``. Defaults to True.
        ciphers_low: Forwarded to ``download_file_from_url``. Defaults to False.
        use_proxy: Forwarded to ``download_file_from_url``. Defaults to False.
        kwargs: Arguments for pandas.read_csv.

    Returns:
        pandas.DataFrame: Data loaded.
    """
    # Only the file *name* is needed; close the handle right away so the
    # downloader and pandas can reopen the path on every platform.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    try:
        download_file_from_url(
            url,
            tmp.name,
            timeout=timeout,
            verify=verify,
            ciphers_low=ciphers_low,
            use_proxy=use_proxy,
        )
        return pd.read_csv(tmp.name, **kwargs)
    finally:
        # The file was created with delete=False, so remove it explicitly;
        # otherwise every call leaves a stray file in the temp directory.
        os.remove(tmp.name)
def download_file_from_url(
    url,
    save_path,
    chunk_size=1024 * 1024,
    timeout=30,
    verify=True,
    ciphers_low=False,
    use_proxy=False,
):
    """Download the content at ``url`` and write it to ``save_path``.

    Args:
        url (str): File url.
        save_path (str): Local path the response body is written to
            (opened in binary write mode).
        chunk_size (int): Size passed to ``Response.iter_content``.
            Defaults to 1 MiB.
        timeout: Passed to requests as ``timeout``. Defaults to 30.
        verify: Passed to requests as ``verify``. Defaults to True.
        ciphers_low (bool): Set to True to mount a ``DESAdapter`` on the base
            URL so the request uses the ``CIPHERS`` cipher list. Defaults to
            False.
        use_proxy (bool): Set to True to rewrite ``url`` with ``to_proxy_url``
            before requesting it. Defaults to False.
    """
    if use_proxy:
        url = to_proxy_url(url)
    # A single session serves both modes so that ``timeout``, ``verify`` and
    # streaming are honoured with ``ciphers_low`` as well, and so that the
    # session and the streamed response are always closed.
    with requests.Session() as session:
        if ciphers_low:
            session.mount(get_base_url(url), DESAdapter())
        with session.get(url, stream=True, timeout=timeout, verify=verify) as response:
            # NOTE(review): the HTTP status is not checked here, so the body of
            # an error response is written to ``save_path`` like any other.
            with open(save_path, "wb") as fd:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    fd.write(chunk)
class DESAdapter(HTTPAdapter):
    """Transport adapter whose TLS contexts are built from ``CIPHERS``.

    Described at its origin as a TransportAdapter that re-enables 3DES
    support in Requests.

    From: https://stackoverflow.com/a/46186957/5056599
    """

    def init_poolmanager(self, *args, **kwargs):
        """Build the pool manager with a ``CIPHERS``-based ``ssl_context``."""
        kwargs.update(ssl_context=create_urllib3_context(ciphers=CIPHERS))
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        """Build the proxy manager with a ``CIPHERS``-based ``ssl_context``."""
        kwargs.update(ssl_context=create_urllib3_context(ciphers=CIPHERS))
        return super().proxy_manager_for(*args, **kwargs)
def get_base_url(url: str, scheme="https") -> str:
    """Return the base of a URL, i.e. ``<scheme>://<netloc>``.

    Only the network location is taken from ``url``; the scheme of the result
    is the ``scheme`` argument, whatever scheme ``url`` itself carries.

    Args:
        url (str): URL to parse, of the form
            ``<scheme>://<netloc>/<path>;<params>?<query>#<fragment>``.
        scheme (str): Scheme of the returned URL, either ``"http"`` or
            ``"https"``. Defaults to ``"https"``.

    Returns:
        str: ``<scheme>://<netloc>`` (no trailing slash).

    Raises:
        ValueError: If ``scheme`` is neither ``"http"`` nor ``"https"``.
    """
    # Guard clause: reject unsupported schemes before parsing anything.
    if scheme not in ("http", "https"):
        raise ValueError("Invalid scheme: {}".format(scheme))
    host = urlparse(url).netloc
    prefix = "http" if scheme == "http" else "https"
    return f"{prefix}://{host}"