import os
from concurrent.futures import ThreadPoolExecutor
from itertools import count

from cachecontrol import CacheControl
from requests import Session

# read(), get_book_id(), get_headers(), get_free_proxies() and the constants
# root_dir, data_dir and NB_RETRIES are defined elsewhere in the source repo

# load book_download_urls
book_download_urls = read(root_dir / 'book_download_urls.txt').splitlines()

# remove any books that have already been downloaded
book_download_urls = [url for url in book_download_urls
                      if not (data_dir / f'{get_book_id(url)}.txt').exists()]

if book_download_urls:
    # keep only the first 500 (Smashwords blocks the IP address after 500 requests)
    book_download_urls = book_download_urls[:500]

    # get headers (user agents)
    headers = get_headers(root_dir / 'user-agents.txt')

    # initialize a cache-controlled session
    session = CacheControl(Session())

    # get proxies
    proxies = get_free_proxies(session=session, headers=headers[0])

    # get the books (concurrently)
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        for nb_retry in count(1):
            # break if all book_download_urls were downloaded successfully
            if not book_download_urls:
                break

            # break if the max number of retries has been exceeded
            if nb_retry > NB_RETRIES:
                print(f'Could not download {len(book_download_urls)} books after {NB_RETRIES} retries.')
                break
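# The excerpt stops before the dispatch step. A minimal sketch of how the
# remaining URLs might be submitted inside the retry loop, assuming a
# hypothetical download_book(url, session, headers, proxies) helper that
# returns True on success (the helper is not part of the original code):
from concurrent.futures import as_completed

futures = {executor.submit(download_book, url, session, headers, proxies): url
           for url in book_download_urls}
# keep only the URLs that failed so the next retry pass picks them up
book_download_urls = [futures[future] for future in as_completed(futures)
                      if not future.result()]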
def __init__(self, access_token=None, cache=None):
    """Constructs a Service object."""
    self.session = requests.Session()
    self.session.params.update(access_token=access_token)
    self.session.headers.update({
        "User-Agent": "mapbox-sdk-py/{0} {1}".format(
            "0.8.0", requests.utils.default_user_agent())})
    if cache:
        self.session = CacheControl(self.session, cache=cache)
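# A sketch of how a constructor like this might be called with an on-disk
# cache; FileCache is CacheControl's bundled file-backed store, and the class
# name and token value here are placeholders:
from cachecontrol.caches import FileCache

service = Service(access_token="YOUR_MAPBOX_TOKEN",
                  cache=FileCache(".web_cache"))
response = service.session.get("https://api.mapbox.com")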
def download_tiles(self):
    # initialize a cache-controlled requests Session object and set the headers
    self.session = CacheControl(requests.Session())
    self.session.headers = HEADERS

    # download the tile images
    for nb_retry in count(1):
        # get the unsuccessful tiles
        tiles = [tile for tile in self.tiles if not tile.success]

        # break if all tiles were downloaded successfully
        if not tiles:
            break

        # break (or raise) if the max number of retries has been exceeded
        if nb_retry > self.nb_retries:
            if not self.quiet_mode:
                raise RuntimeError(
                    f'Could not download {len(tiles)}/{len(self.tiles)} tiles after {self.nb_retries} retries.')
            break
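# The loop above only relies on each tile carrying a success flag. A minimal
# sketch of the kind of record this assumes (a hypothetical stand-in, not the
# original class):
from dataclasses import dataclass

@dataclass
class Tile:
    url: str
    success: bool = False  # set to True once the image has been fetched and saved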
def __init__(self, host, port, api_version, auth=None, verify=True,
             proxies=None):  # signature inferred from the attributes used below
    self._logger = logging.getLogger(__name__)
    self.host = host
    self.port = str(port)
    self.api_version = api_version

    session = requests.Session()
    session.verify = verify
    session.proxies = proxies
    session.auth = auth

    # retry transient server errors with exponential backoff
    retry_policies = Retry(total=10,
                           read=10,
                           connect=10,
                           backoff_factor=.5,
                           status_forcelist=(500, 502, 504))
    http_retry = HTTPAdapter(max_retries=retry_policies)
    session.mount(host, http_retry)

    self.session = CacheControl(session)
    self._get_remote_api_specs()
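# Session.mount() registers an adapter for a URL prefix, and requests picks the
# adapter with the longest matching prefix. Since CacheControl() mounts its
# caching adapter on the generic 'http://' and 'https://' prefixes, requests to
# URLs under `host` are served by the retry-enabled HTTPAdapter mounted above.
# A usage sketch, with a hypothetical class name and host:
client = Client(host='https://api.example.com', port=443, api_version='v1')
response = client.session.get('https://api.example.com/status')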
def get_session(output_dir, verbose=True):
    session = requests.Session()
    try:
        import cachecontrol
        import cachecontrol.caches
    except ImportError:
        if verbose:
            print("Tip: install CacheControl (conda package) to cache the CRAN metadata")
    else:
        session = cachecontrol.CacheControl(session,
            cache=cachecontrol.caches.FileCache(join(output_dir, '.web_cache')))
    return session
def get_session(output_dir, verbose=True, cache=[]):
    if cache:
        return cache[0]
    session = requests.Session()
    try:
        import cachecontrol
        import cachecontrol.caches
    except ImportError:
        if verbose:
            print("Tip: install CacheControl to cache the CRAN metadata")
    else:
        session = cachecontrol.CacheControl(session,
            cache=cachecontrol.caches.FileCache(join(output_dir, '.web_cache')))
    cache.append(session)
    return session
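# This second variant deliberately uses the mutable default argument cache=[]
# as a memo: the first call stores the session in the list, and every later
# call returns that same object, so the FileCache is only wired up once.
# For example (the output path is just a placeholder):
s1 = get_session('/tmp/cran_metadata')
s2 = get_session('/tmp/cran_metadata')
assert s1 is s2  # the same cached session is reused across calls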
def __init__(self, conf):
    self.conf = conf
    self.session = CacheControl(requests.Session())
    self.session.headers.update({
        'User-agent': 'Kibitzr/' + version,
    })
    self.url = conf['url']
    self.valid_http = set(conf.get('valid_http', [200]))
    self.verify_cert = conf.get('verify-cert', conf.get('verify_cert', True))
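# A minimal sketch of how these attributes might drive a fetch, assuming a
# hypothetical fetch() method on the same class (not shown in the excerpt):
def fetch(self):
    response = self.session.get(self.url, verify=self.verify_cert)
    ok = response.status_code in self.valid_http
    return ok, response.text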