Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if line.strip()
]
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
else 0
)
# Check for common failure cases
if result.returncode > 0 and files_downloaded < 1:
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=WGET_VERSION,
output=output,
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=out_dir)
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to save media', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=YOUTUBEDL_VERSION,
output=output,
status=status,
**timer.stats,
)
if 'Downloaded:' in output_tail[-1]
else 0
)
# Check for common failure cases
if result.returncode > 0 and files_downloaded < 1:
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=WGET_VERSION,
output=output,
status=status,
**timer.stats,
)
# Body of the screenshot step (the enclosing `def` line is not part of this excerpt).
# Runs the Chrome command line with --screenshot against link.url and reports
# the outcome as an ArchiveResult.
# Fall back to the link's own directory when no out_dir was supplied.
out_dir = out_dir or link.link_dir
# Value reported as the result on success; replaced by the exception on failure.
# presumably also the file name Chrome writes inside out_dir -- TODO confirm
output: ArchiveOutput = 'screenshot.png'
# chrome_args() is defined elsewhere; its items are spliced in ahead of the flag and URL.
cmd = [
*chrome_args(TIMEOUT=timeout),
'--screenshot',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
# Both output streams are captured so they can be used as hints on failure.
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
# A non-zero exit code is treated as failure; stderr is preferred for the
# hint text, with stdout as the fallback when stderr is empty.
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save screenshot', hints)
# Only reached when the command exited with code 0.
chmod_file(output, cwd=out_dir)
# Any exception (including the ArchiveError above) is recorded in the result
# instead of propagating to the caller.
except Exception as err:
status = 'failed'
output = err
# The timer is stopped on both the success and the failure path.
finally:
timer.end()
# timer.stats is expanded into additional keyword arguments of the result.
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)
"""try to guess the page's title from its content"""
output: ArchiveOutput = None
cmd = [
CURL_BINARY,
link.url,
'|',
'grep',
'
submit_url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
elif errors:
raise ArchiveError(', '.join(errors))
else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
if output and not isinstance(output, Exception):
# instead of writing None when archive.org rejects the url write the
# url to resubmit it to archive.org. This is so when the user visits
# the URL in person, it will attempt to re-archive it, and it'll show the
# nicer error message explaining why the url was rejected if it fails.
archive_org_url = archive_org_url or submit_url
with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
chmod_file('archive.org.txt', cwd=out_dir)
output = archive_org_url
'--max-time', str(timeout),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
submit_url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
elif errors:
raise ArchiveError(', '.join(errors))
else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
if output and not isinstance(output, Exception):
# instead of writing None when archive.org rejects the url write the
# url to resubmit it to archive.org. This is so when the user visits
# the URL in person, it will attempt to re-archive it, and it'll show the
# nicer error message explaining why the url was rejected if it fails.
archive_org_url = archive_org_url or submit_url
with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
# Body of the PDF step (the enclosing `def` line is not part of this excerpt).
# Runs the Chrome command line with --print-to-pdf against link.url and reports
# the outcome as an ArchiveResult.
# Fall back to the link's own directory when no out_dir was supplied.
out_dir = out_dir or link.link_dir
# Value reported as the result on success; replaced by the exception on failure.
# presumably also the file name Chrome writes inside out_dir -- TODO confirm
output: ArchiveOutput = 'output.pdf'
# chrome_args() is defined elsewhere; its items are spliced in ahead of the flag and URL.
cmd = [
*chrome_args(TIMEOUT=timeout),
'--print-to-pdf',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
# Both output streams are captured so they can be used as hints on failure.
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
# A non-zero exit code is treated as failure; stderr is preferred for the
# hint text, with stdout as the fallback when stderr is empty.
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save PDF', hints)
# Only reached when the command exited with code 0.
# NOTE(review): the literal equals `output` at this point; the screenshot and
# DOM bodies in this file pass `output` here instead -- consider aligning.
chmod_file('output.pdf', cwd=out_dir)
# Any exception (including the ArchiveError above) is recorded in the result
# instead of propagating to the caller.
except Exception as err:
status = 'failed'
output = err
# The timer is stopped on both the success and the failure path.
finally:
timer.end()
# timer.stats is expanded into additional keyword arguments of the result.
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)
'--mirror',
'--recursive',
*([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
without_query(without_fragment(link.url)),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'Got git response code: {}.'.format(result.returncode)
raise ArchiveError('Failed to save git clone', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=GIT_VERSION,
output=output,
status=status,
**timer.stats,
)
# Body of the DOM-dump step (the enclosing `def` line, and wherever out_dir is
# bound, are not part of this excerpt).
# Runs the Chrome command line with --dump-dom against link.url, writing the
# command's stdout to a file, and reports the outcome as an ArchiveResult.
# Value reported as the result on success; replaced by the exception on failure.
output: ArchiveOutput = 'output.html'
# Full path of the file that receives the command's stdout.
output_path = os.path.join(out_dir, str(output))
# chrome_args() is defined elsewhere; its items are spliced in ahead of the flag and URL.
cmd = [
*chrome_args(TIMEOUT=timeout),
'--dump-dom',
link.url
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
# stdout goes straight into the opened file; only stderr is captured, and it
# is used for the failure hints below.
with open(output_path, 'w+') as f:
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
# A non-zero exit code is treated as failure.
if result.returncode:
hints = result.stderr.decode()
raise ArchiveError('Failed to save DOM', hints)
# Only reached when the command exited with code 0.
chmod_file(output, cwd=out_dir)
# Any exception (including the ArchiveError above) is recorded in the result
# instead of propagating to the caller.
except Exception as err:
status = 'failed'
output = err
# The timer is stopped on both the success and the failure path.
finally:
timer.end()
# timer.stats is expanded into additional keyword arguments of the result.
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)