How to use the archivebox.archive_methods.ArchiveError function in archivebox

To help you get started, we’ve selected a few archivebox examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
'--mirror',
        '--recursive',
        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
        without_query(without_fragment(link['url'])),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)

        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed git download', hints)

    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
"""try to guess the page's title from its content"""

    output: ArchiveOutput = None
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
"""take screenshot of site using chrome --headless"""

    output = 'screenshot.png'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--screenshot',
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to take screenshot', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url