cowidev.utils.web¶
cowidev.utils.web.download¶
- class cowidev.utils.web.download.DESAdapter(pool_connections=10, pool_maxsize=10, max_retries=0, pool_block=False)[source]¶
Bases: HTTPAdapter
A TransportAdapter that re-enables 3DES support in Requests.
From: https://stackoverflow.com/a/46186957/5056599
- init_poolmanager(*args, **kwargs)[source]¶
Initializes a urllib3 PoolManager.
This method should not be called from user code, and is only exposed for use when subclassing the HTTPAdapter.
- Parameters:
connections – The number of urllib3 connection pools to cache.
maxsize – The maximum number of connections to save in the pool.
block – Block when no free connections are available.
pool_kwargs – Extra keyword arguments used to initialize the Pool Manager.
- proxy_manager_for(*args, **kwargs)[source]¶
Return urllib3 ProxyManager for the given proxy.
This method should not be called from user code, and is only exposed for use when subclassing the HTTPAdapter.
- Parameters:
proxy – The proxy to return a urllib3 ProxyManager for.
proxy_kwargs – Extra keyword arguments used to configure the Proxy Manager.
- Returns:
ProxyManager
- Return type:
urllib3.ProxyManager
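The adapter above can be sketched as follows, following the linked Stack Overflow answer. The class body and cipher string here are assumptions for illustration, not cowidev's exact implementation:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.ssl_ import create_urllib3_context

# Cipher string that keeps strong ciphers and re-admits 3DES
# (assumed; adapted from the linked Stack Overflow answer).
CIPHERS = "HIGH:!DH:!aNULL:3DES"


class DESAdapterSketch(HTTPAdapter):
    """A TransportAdapter that re-enables 3DES for legacy servers."""

    def init_poolmanager(self, *args, **kwargs):
        # Build an SSL context with the custom cipher list and hand it
        # to the urllib3 PoolManager.
        kwargs["ssl_context"] = create_urllib3_context(ciphers=CIPHERS)
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        kwargs["ssl_context"] = create_urllib3_context(ciphers=CIPHERS)
        return super().proxy_manager_for(*args, **kwargs)


# Mount the adapter only for the host that needs the legacy ciphers.
session = requests.Session()
session.mount("https://legacy.example.org", DESAdapterSketch())
```

Mounting on a URL prefix means only requests to that host go through the custom SSL context; all other traffic keeps the default ciphers.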
- cowidev.utils.web.download.download_file_from_url(url, save_path, chunk_size=1048576, timeout=30, verify=True, ciphers_low=False, use_proxy=False)[source]¶
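A minimal stdlib sketch of what download_file_from_url does with its url, save_path, chunk_size and timeout arguments (the verify, ciphers_low and use_proxy options need a configured requests session and are left out):

```python
import shutil
import urllib.request


def download_file_sketch(url, save_path, chunk_size=1048576, timeout=30):
    """Stream the file at `url` to `save_path` in fixed-size chunks."""
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        with open(save_path, "wb") as f:
            # copyfileobj reads at most `chunk_size` bytes at a time, so
            # large files never have to fit in memory.
            shutil.copyfileobj(resp, f, length=chunk_size)
```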
- cowidev.utils.web.download.get_base_url(url: str, scheme='https') str [source]¶
Parse a URL and return the base URL.
- Parameters:
url (str) – URL to parse. <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
scheme ({‘http’, ‘https’}, optional) – Scheme to use. Defaults to ‘https’.
- Returns:
Base URL, in the form <scheme>://<netloc>/
- Return type:
str
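The documented behavior maps directly onto the stdlib's urllib.parse; a minimal sketch:

```python
from urllib.parse import urlparse


def get_base_url_sketch(url: str, scheme: str = "https") -> str:
    """Return `<scheme>://<netloc>/` for the given URL."""
    netloc = urlparse(url).netloc
    return f"{scheme}://{netloc}/"


get_base_url_sketch("https://example.org/path/page?q=1")  # → "https://example.org/"
```

Note that the path, query and fragment are dropped, and the requested scheme is applied regardless of the scheme in the input URL.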
- cowidev.utils.web.download.read_csv_from_url(url, timeout=30, verify=True, ciphers_low=False, use_proxy=False, **kwargs)[source]¶
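Judging by its signature, read_csv_from_url fetches the URL and hands the bytes to pandas.read_csv; a sketch under that assumption:

```python
import io
import urllib.request

import pandas as pd


def read_csv_from_url_sketch(url, timeout=30, **kwargs):
    """Fetch a CSV over HTTP and parse it with pandas.read_csv.

    `kwargs` are forwarded to pandas.read_csv; the real helper also
    accepts verify/ciphers_low/use_proxy options, omitted here.
    """
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return pd.read_csv(io.BytesIO(resp.read()), **kwargs)
```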
- cowidev.utils.web.download.read_xlsx_from_url(url: str, timeout=30, as_series: bool = False, verify=True, drop=False, ciphers_low=False, use_proxy=False, **kwargs) DataFrame [source]¶
Download and load an XLSX file from a URL.
- Parameters:
url (str) – File url.
as_series (bool) – Set to True to return a pandas.Series object. Source file must be of shape 1xN (1 row, N columns). Defaults to False.
kwargs – Arguments for pandas.read_excel.
- Returns:
Data loaded.
- Return type:
pandas.DataFrame
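The as_series option is the only documented twist; a sketch of that post-processing step (assumed to collapse the single row into a Series indexed by the column names):

```python
import pandas as pd


def to_series_if_requested(df, as_series=False):
    """Collapse a 1xN DataFrame to a Series when `as_series` is True."""
    if not as_series:
        return df
    if len(df) != 1:
        raise ValueError("as_series requires a 1-row (1xN) DataFrame")
    # .iloc[0] yields a Series whose index is the frame's column names.
    return df.iloc[0]
```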
cowidev.utils.web.scraping¶
- cowidev.utils.web.scraping.get_driver(headless: bool = True, download_folder: str | None = None, options=None, firefox: bool = False, timeout: int | None = None)[source]¶
- cowidev.utils.web.scraping.get_headers() dict [source]¶
Get generic headers for requests.
- Returns:
Header.
- Return type:
dict
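The exact header set is not documented; a sketch of what a generic, browser-like header dict for requests typically looks like (the User-Agent string is an assumption, not cowidev's actual value):

```python
def get_headers_sketch() -> dict:
    """Return a generic browser-like header set for HTTP requests."""
    return {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
```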
- cowidev.utils.web.scraping.get_response(source: str, request_method: str = 'get', use_proxy: bool = False, **kwargs)[source]¶
- cowidev.utils.web.scraping.get_soup(source: str, from_encoding: str | None = None, parser='lxml', request_method: str = 'get', use_proxy: bool = False, **kwargs) BeautifulSoup [source]¶
Get soup from website.
- Parameters:
source (str) – Website url.
from_encoding (str, optional) – Encoding to use. Defaults to None.
parser (str, optional) – HTML parser. See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser. Defaults to ‘lxml’.
request_method (str, optional) – Request method. Options are ‘get’ and ‘post’. Defaults to ‘get’. For POST requests, make sure to specify headers (the default ones do not work).
use_proxy (bool, optional) – Set to True to route the request through a proxy. Defaults to False.
kwargs (dict) – Extra arguments passed to requests.get method. Default values for headers, verify and timeout are used.
- Returns:
Website soup.
- Return type:
BeautifulSoup
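Putting the documented parameters together, the helper presumably fetches the page and parses the response body with BeautifulSoup; a sketch under that assumption:

```python
import requests
from bs4 import BeautifulSoup


def get_soup_sketch(source, from_encoding=None, parser="lxml",
                    request_method="get", **kwargs):
    """Fetch `source` and return its parsed soup."""
    if request_method not in ("get", "post"):
        raise ValueError("request_method must be 'get' or 'post'")
    response = getattr(requests, request_method)(source, **kwargs)
    response.raise_for_status()
    return BeautifulSoup(response.content, parser, from_encoding=from_encoding)
```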
- cowidev.utils.web.scraping.request_json(url, mode='soup', **kwargs) dict [source]¶
Get data from url as a dictionary.
Content at url should be a dictionary.
- Parameters:
url (str) – URL to data.
mode (str) – Mode to use. Accepted values are ‘soup’ (default) and ‘raw’.
kwargs – Check get_soup for the complete list of accepted arguments.
- Returns:
Data
- Return type:
dict
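In ‘raw’ mode this amounts to fetching and JSON-decoding the body; a stdlib sketch (the ‘soup’ mode, which extracts the text via get_soup first, is left out):

```python
import json
import urllib.request


def request_json_sketch(url, timeout=30):
    """Fetch `url` and decode its body as a JSON dictionary."""
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    if not isinstance(data, dict):
        raise ValueError("Content at url should be a dictionary")
    return data
```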
- cowidev.utils.web.scraping.request_text(url, mode='soup', **kwargs) str [source]¶
Get data from url as plain text.
Content at url should be plain text.
- Parameters:
url (str) – URL to data.
mode (str) – Mode to use. Accepted values are ‘soup’ (default) and ‘raw’.
kwargs – Check get_soup for the complete list of accepted arguments.
- Returns:
Data
- Return type:
str
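The ‘raw’ mode again reduces to a plain fetch-and-decode; a stdlib sketch:

```python
import urllib.request


def request_text_sketch(url, timeout=30):
    """Fetch `url` and return its body as decoded text."""
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        charset = resp.headers.get_content_charset() or "utf-8"
        return resp.read().decode(charset)
```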
cowidev.utils.web.utils¶
- cowidev.utils.web.get_base_url(url: str, scheme='https') str [source]¶
Parse a URL and return the base URL.
- Parameters:
url (str) – URL to parse. <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
scheme ({‘http’, ‘https’}, optional) – Scheme to use. Defaults to ‘https’.
- Returns:
Base URL, in the form <scheme>://<netloc>/
- Return type:
str
- cowidev.utils.web.get_driver(headless: bool = True, download_folder: str | None = None, options=None, firefox: bool = False, timeout: int | None = None)[source]¶
- cowidev.utils.web.get_soup(source: str, from_encoding: str | None = None, parser='lxml', request_method: str = 'get', use_proxy: bool = False, **kwargs) BeautifulSoup [source]¶
Get soup from website.
- Parameters:
source (str) – Website url.
from_encoding (str, optional) – Encoding to use. Defaults to None.
parser (str, optional) – HTML parser. See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser. Defaults to ‘lxml’.
request_method (str, optional) – Request method. Options are ‘get’ and ‘post’. Defaults to ‘get’. For POST requests, make sure to specify headers (the default ones do not work).
use_proxy (bool, optional) – Set to True to route the request through a proxy. Defaults to False.
kwargs (dict) – Extra arguments passed to requests.get method. Default values for headers, verify and timeout are used.
- Returns:
Website soup.
- Return type:
BeautifulSoup
- cowidev.utils.web.read_xlsx_from_url(url: str, timeout=30, as_series: bool = False, verify=True, drop=False, ciphers_low=False, use_proxy=False, **kwargs) DataFrame [source]¶
Download and load an XLSX file from a URL.
- Parameters:
url (str) – File url.
as_series (bool) – Set to True to return a pandas.Series object. Source file must be of shape 1xN (1 row, N columns). Defaults to False.
kwargs – Arguments for pandas.read_excel.
- Returns:
Data loaded.
- Return type:
pandas.DataFrame
- cowidev.utils.web.request_json(url, mode='soup', **kwargs) dict [source]¶
Get data from url as a dictionary.
Content at url should be a dictionary.
- Parameters:
url (str) – URL to data.
mode (str) – Mode to use. Accepted values are ‘soup’ (default) and ‘raw’.
kwargs – Check get_soup for the complete list of accepted arguments.
- Returns:
Data
- Return type:
dict