Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
details = dict(args=kwargs, error=error)
if r.status_code == 404:
raise NotFoundException(url=wb_url.url, msg=details)
else:
raise UpstreamException(r.status_code, url=wb_url.url, details=details)
cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))
cdx_url_parts = urlsplit(cdx['url'])
if cdx_url_parts.path.endswith('/') and not url_parts.path.endswith('/'):
# add trailing slash
new_path = url_parts.path + '/'
no_except_close(r.raw)
return self.send_redirect(new_path, url_parts, urlrewriter)
stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
record = self.loader.parse_record_stream(stream,
ensure_http_headers=True)
memento_dt = r.headers.get('Memento-Datetime')
target_uri = r.headers.get('WARC-Target-URI')
# cdx['urlkey'] = urlkey
# cdx['timestamp'] = http_date_to_timestamp(memento_dt)
# cdx['url'] = target_uri
set_content_loc = False
warc_headers = payload.rec_headers
if headers != payload:
warc_headers.replace_header('WARC-Refers-To-Target-URI',
payload.rec_headers.get_header('WARC-Target-URI'))
warc_headers.replace_header('WARC-Refers-To-Date',
payload.rec_headers.get_header('WARC-Date'))
warc_headers.replace_header('WARC-Target-URI',
headers.rec_headers.get_header('WARC-Target-URI'))
warc_headers.replace_header('WARC-Date',
headers.rec_headers.get_header('WARC-Date'))
no_except_close(headers.raw_stream)
return (warc_headers, http_headers_buff, payload.raw_stream)
def load_yaml_config(config_file):
    """Read ``config_file`` and return its parsed YAML content.

    The file is opened through ``load`` and the resulting object is always
    handed to ``no_except_close`` afterwards, whether or not parsing
    succeeded.  Any exception raised while opening or parsing propagates
    to the caller.

    NOTE(review): ``yaml.Loader`` is PyYAML's full loader, which can build
    arbitrary Python objects from tagged input.  That is only appropriate
    if ``config_file`` is trusted -- confirm, and consider
    ``yaml.SafeLoader`` if it is not.
    """
    stream = None
    try:
        stream = load(config_file)
        return yaml.load(stream, Loader=yaml.Loader)
    finally:
        # Runs on both the success and the failure path; ``stream`` is
        # still None here if ``load`` itself raised.
        no_except_close(stream)
upstream_res = manager.urlopen(method=method,
url=load_url,
body=data,
headers=req_headers,
redirect=False,
assert_same_host=False,
preload_content=False,
decode_content=False,
retries=max_retries,
timeout=params.get('_timeout'))
return upstream_res
except Exception as e:
if upstream_res:
no_except_close(upstream_res)
if logger.isEnabledFor(logging.DEBUG):
import traceback
traceback.print_exc()
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
raise LiveResourceException(load_url)
skipping = True
if not skipping:
entry = (self.req.headers, self.req.out,
self.headers, self.out, self.params)
self.queue.put(entry)
except Exception:
traceback.print_exc()
skipping = True
finally:
if skipping:
no_except_close(self.out)
no_except_close(self.req.out)
no_except_close(self.req)
self.req = None
total_pages = 1
if query.page_count:
# same line, so actually need to look at cdx
# to determine if it exists
if blocks == 0:
try:
block_cdx_iter = self.idx_to_cdx([first_line], query)
block = six.next(block_cdx_iter)
cdx = six.next(block)
except StopIteration:
total_pages = 0
blocks = -1
yield self._page_info(total_pages, pagesize, blocks + 1)
no_except_close(reader)
return
curr_page = query.page
if curr_page >= total_pages or curr_page < 0:
msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
no_except_close(reader)
raise CDXException(msg.format(curr_page, total_pages - 1))
startline = curr_page * pagesize
endline = startline + pagesize - 1
if blocks >= 0:
endline = min(endline, blocks)
if curr_page == 0:
yield first_line
else:
req_pay.seek(0)
req = self.writer.create_warc_record(uri=uri,
record_type='request',
payload=req_pay,
length=req_length,
warc_headers_dict=req_head)
self.writer.write_request_response_pair(req, resp, params)
else:
self.writer.write_record(resp, params)
finally:
try:
if req_pay:
no_except_close(req_pay)
if resp_pay:
no_except_close(resp_pay)
except Exception as e:
traceback.print_exc()
record = self.writer.create_warc_record(uri=params['url'],
record_type=record_type,
payload=req_stream.out,
length=payload_length,
warc_content_type=content_type,
warc_headers_dict=req_stream.headers)
self.writer.write_record(record, params)
msg = {'success': 'true',
'WARC-Date': record.rec_headers.get_header('WARC-Date')}
finally:
if req_stream:
no_except_close(req_stream.out)
return self.send_message(msg,
'200 OK',
start_response)
# if starting with . or /, can only be a file path..
file_only = url.startswith(('/', '.'))
# convert to filename
filename = from_file_url(url)
if filename != url:
file_only = True
url = filename
afile = None
try:
# first, try as file
afile = open(url, 'rb')
except IOError:
no_except_close(afile)
if file_only:
raise
return super(LocalFileLoader, self).load(url, offset, length)
if offset > 0:
afile.seek(offset)
if length >= 0:
return LimitReader(afile, length)
else:
return afile
def handle_timemap(self, params):
    """Fetch the timemap for ``params`` and convert it to CDX objects.

    The request URL is built by filling ``self.timemap_url`` with
    ``params`` via ``res_template``; request headers come from
    ``self._get_headers``.  ``params.get('_timeout')`` is passed as the
    request timeout.

    :param params: dict-like request parameters (``.get`` is called on it)
    :return: the result of ``self.links_to_cdxobject(links, 'timemap')``
    :raises NotFoundException: if the request or ``raise_for_status()``
        raises, or if the response text is empty
    """
    url = res_template(self.timemap_url, params)
    headers = self._get_headers(params)

    res = None
    try:
        res = self.sesh.get(url,
                            headers=headers,
                            timeout=params.get('_timeout'))
        res.raise_for_status()

        # Explicit check rather than ``assert``: assertions are stripped
        # under ``python -O``, which would let an empty body through.
        if not res.text:
            raise ValueError('empty timemap response')
    except Exception as e:
        # Every failure is reported to the caller as NotFoundException;
        # the underlying cause is only visible in the debug log.
        # ``res`` is still None here if the request itself raised.
        no_except_close(res)
        self.logger.debug('FAILED: ' + str(e))
        raise NotFoundException(url)

    links = res.text
    return self.links_to_cdxobject(links, 'timemap')