result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir)
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to download media', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir)
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to download media', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=link_dir,
cmd_version=YOUTUBEDL_VERSION,
output=output,
status=status,
**timer.stats,
)
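# --- Added sketch (not from the source): the youtube-dl fragments above start after the
# `cmd` list has already been built. A minimal sketch of how it might be assembled;
# YOUTUBEDL_BINARY, MEDIA_TIMEOUT, and CHECK_SSL_VALIDITY are hypothetical placeholders
# for the module's config, and the exact flags used upstream may differ.
YOUTUBEDL_BINARY = 'youtube-dl'
MEDIA_TIMEOUT = 3600
CHECK_SSL_VALIDITY = True

def build_media_cmd(url):
    """Assemble a youtube-dl invocation for a single page URL (illustrative only)."""
    return [
        YOUTUBEDL_BINARY,
        '--write-description',    # save the description next to the media files
        '--write-info-json',      # save metadata as JSON
        '--write-thumbnail',      # save the thumbnail image
        '--ignore-errors',        # keep going when individual formats fail
        '--socket-timeout', str(MEDIA_TIMEOUT),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
        url,
    ]

# cmd = build_media_cmd(link['url'])
# result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=MEDIA_TIMEOUT + 1)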
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
else 0
)
# Check for common failure cases
if result.returncode > 0 and files_downloaded < 1:
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=link_dir,
cmd_version=WGET_VERSION,
output=output,
status=status,
**timer.stats,
)
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
else 0
)
# Check for common failure cases
if result.returncode > 0 and files_downloaded < 1:
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
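# --- Added sketch (not from the source): the wget fragments above read from an
# `output_tail` variable that is never shown. One plausible way to derive it is to keep
# the last few non-empty lines of the combined process output; wget prints a summary
# line like "Downloaded: 4 files, 1.1M in 0.3s" there, which is what the
# `files_downloaded` parsing relies on.
def tail_of_output(result, lines=3):
    """Return the last few non-empty lines of combined stdout/stderr (illustrative only)."""
    combined = (result.stdout + result.stderr).decode(errors='replace')
    return [line.strip() for line in combined.rsplit('\n', lines)[-lines:] if line.strip()]

# output_tail = tail_of_output(result)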
'--mirror',
'--recursive',
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
without_query(without_fragment(link['url'])),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'Got git response code: {}.'.format(result.returncode)
raise ArchiveError('Failed git download', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
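# --- Added sketch (not from the source): the git fragment above starts partway through
# its `cmd` list. A full command matching the visible tail could look like the following;
# GIT_BINARY is a hypothetical placeholder for the configured git path.
GIT_BINARY = 'git'
CHECK_SSL_VALIDITY = True

cmd = [
    GIT_BINARY,
    'clone',
    '--mirror',        # mirror all refs rather than checking out a working tree
    '--recursive',     # also fetch submodules
    *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
    'https://github.com/example/repo',  # stand-in for without_query(without_fragment(link['url']))
]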
"""try to guess the page's title from its content"""
output: ArchiveOutput = None
cmd = [
CURL_BINARY,
link.url,
'|',
'grep',
'<title>',
]
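# --- Added note (not from the source): because `run()` receives a list without
# shell=True, the '|' and 'grep' elements above are passed to curl as literal arguments
# rather than forming a pipeline. A self-contained sketch that fetches the page and
# extracts the <title> tag without a shell pipe (regex and names are illustrative):
import re
from subprocess import run, PIPE

def guess_title(url, timeout=60):
    """Fetch a page with curl and pull out the first <title> tag (illustrative only)."""
    result = run(['curl', '--silent', '--location', '--max-time', str(timeout), url],
                 stdout=PIPE, stderr=PIPE, timeout=timeout + 1)
    html = result.stdout.decode(errors='replace')
    match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else ''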
"""take screenshot of site using chrome --headless"""
output = 'screenshot.png'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--screenshot',
link['url'],
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to take screenshot', hints)
chmod_file(output, cwd=link_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
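# --- Added sketch (not from the source): the screenshot fragment builds its command from
# a chrome_args(TIMEOUT=...) helper that is not shown. A rough sketch of what such a
# helper could return; CHROME_BINARY and RESOLUTION are hypothetical placeholders and the
# real helper likely includes more flags.
CHROME_BINARY = 'chromium-browser'
RESOLUTION = '1440,900'
CHECK_SSL_VALIDITY = True

def chrome_args(TIMEOUT=60):
    """Base arguments for driving headless Chrome (illustrative only)."""
    return [
        CHROME_BINARY,
        '--headless',                            # run without a visible window
        '--disable-gpu',                         # avoid GPU init problems on headless servers
        '--window-size={}'.format(RESOLUTION),   # viewport used for --screenshot
        '--timeout={}'.format(TIMEOUT * 1000),   # page-load timeout in milliseconds
        *([] if CHECK_SSL_VALIDITY else ['--ignore-certificate-errors']),
    ]

# cmd = [*chrome_args(TIMEOUT=60), '--screenshot', 'https://example.com']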
submit_url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
elif errors:
raise ArchiveError(', '.join(errors))
else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
if not isinstance(output, Exception):
# Instead of writing None when archive.org rejects the URL, write the
# submit URL itself. That way, when the user later visits the saved link,
# it re-attempts the archive and shows the nicer error message explaining
# why the URL was rejected if it fails again.
archive_org_url = archive_org_url or submit_url
with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
chmod_file('archive.org.txt', cwd=link_dir)
output = archive_org_url
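# --- Added sketch (not from the source): the fragment above calls
# parse_archive_dot_org_response() on result.stdout and expects back a list of
# content-location values and a list of error strings. Assuming the command dumps the
# HTTP response headers (e.g. a HEAD request), the helper could look roughly like this:
def parse_archive_dot_org_response(response):
    """Collect Content-Location and Wayback error headers from raw response bytes (illustrative only)."""
    content_location, errors = [], []
    for line in response.decode(errors='replace').splitlines():
        if ':' not in line:
            continue
        name, _, value = line.partition(':')
        name, value = name.strip().lower(), value.strip()
        if name == 'content-location':
            content_location.append(value)
        elif name == 'x-archive-wayback-runtime-error':
            errors.append(value)    # e.g. 'RobotAccessControlException: ...' (see the check above)
    return content_location, errors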