How to use the archivebox.config.TIMEOUT setting in archivebox

To help you get started, we’ve selected a few archivebox examples based on popular ways archivebox.config.TIMEOUT is used in public projects.

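archivebox.config.TIMEOUT is the global timeout setting (in seconds, typically 60 by default) that ArchiveBox reads from its configuration and uses as the default time budget for each archiving method, as the examples below show. A minimal sketch of the pattern; fetch_page is a hypothetical helper, not part of ArchiveBox:

from subprocess import run, PIPE

from archivebox.config import TIMEOUT  # global timeout setting, in seconds

def fetch_page(url: str, timeout: int=TIMEOUT) -> bytes:
    # Accept a per-call timeout that defaults to the config value, and pass
    # it through to the subprocess so both layers share one time budget.
    result = run(['curl', '--max-time', str(timeout), url],
                 stdout=PIPE, stderr=PIPE, timeout=timeout)
    return result.stdout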

github pirate / ArchiveBox / archivebox / extractors / git.py (View on Github)
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
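The snippet ends where the subprocess would be launched. Based on the pattern visible in the other extractors on this page, the function presumably continues by running the command with the same timeout and stopping the timer in a finally block; the details below are an assumption, not a verbatim quote:

    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to save git clone', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()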
github pirate / ArchiveBox / archivebox / archive_methods.py (View on Github)
def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(link_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
github pirate / ArchiveBox / archivebox / parsers / __init__.py (View on Github)
    PARSERS = (
        # Specialized parsers
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),
        
        # General parsers
        ('Netscape HTML', parse_netscape_html_export),
        ('Generic RSS', parse_generic_rss_export),
        ('Generic JSON', parse_generic_json_export),

        # Fallback parser
        ('Plain Text', parse_generic_txt_export),
    )
    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        for parser_name, parser_func in PARSERS:
            try:
                links = list(parser_func(file))
                if links:
                    timer.end()
                    return links, parser_name
            except Exception as err:   # noqa
                pass
                # Parsers are tried one by one down the list, and the first one
                # that succeeds is used. To see why a certain parser was not used
                # due to error or format incompatibility, uncomment this line:
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                # raise

    timer.end()
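Note the TIMEOUT * 4 here: one generous timer spans every parser attempt, rather than one timer per parser, since several parsers may fail before one succeeds. The same idea in isolation, as a hypothetical standalone sketch (first_success is not an ArchiveBox function):

import time

def first_success(attempts, budget_seconds):
    # Share a single time budget across several fallible attempts and
    # return the first result that parses, mirroring the loop above.
    deadline = time.monotonic() + budget_seconds
    for name, func in attempts:
        if time.monotonic() > deadline:
            raise TimeoutError('parsing budget exhausted')
        try:
            return func(), name
        except Exception:
            continue  # try the next parser, as the loop above does
    raise ValueError('no parser succeeded')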
github pirate / ArchiveBox / archivebox / extractors / screenshot.py (View on Github)
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """take screenshot of site using chrome --headless"""
    
    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'screenshot.png'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--screenshot',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
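            # Assumed continuation, following the pattern of the other snippets:
            # a bad exit code raises, and any exception (including the
            # TimeoutExpired that run() raises when the timeout is exceeded)
            # downgrades the result to 'failed' instead of crashing the run.
            raise ArchiveError('Failed to take screenshot', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()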
github pirate / ArchiveBox / archivebox / archive_methods.py (View on Github)
def fetch_title(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '<title>',
    ]
github pirate / ArchiveBox / archivebox / extractors / title.py (View on Github)
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '<title>',
    ]
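Note that a cmd list containing a literal '|' only works if it is joined and run through a shell; here it mostly serves as a human-readable record of the command, and the actual fetch presumably happens in-process. A minimal standalone sketch of guessing a title under the same timeout budget, using only the standard library (guess_title and its regex are illustrative assumptions, not ArchiveBox code):

import re
from urllib.request import urlopen

def guess_title(url: str, timeout: int=60) -> str:
    # Fetch the page within the timeout budget and pull out <title>...</title>
    html = urlopen(url, timeout=timeout).read().decode('utf-8', errors='replace')
    match = re.search(r'<title[^>]*>([^<]+)</title>', html, re.IGNORECASE)
    return match.group(1).strip() if match else ''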
github pirate / ArchiveBox / archivebox / extractors / favicon.py (View on Github)
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', str(output),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
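        # Note: the timeout is enforced twice here. curl gives up on its own
        # after --max-time seconds, and run(..., timeout=timeout) raises
        # TimeoutExpired as a backstop if curl hangs anyway. Assumed continuation:
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()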
github pirate / ArchiveBox / archivebox / extractors / dom.py (View on Github)
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-html"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'output.html'
    output_path = os.path.join(out_dir, str(output))
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link.url
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
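        # Streaming stdout straight into output.html avoids buffering a large
        # DOM dump in memory (contrast with the PIPE capture in screenshot.py).
        # Assumed continuation, following the pattern of the other extractors:
        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to save DOM', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()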
github pirate / ArchiveBox / archivebox / extractors / archive_org.py (View on Github)
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--max-time', str(timeout),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        submit_url,
    ]
    status = 'succeeded'
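The --head request is expected to come back with the snapshot's location in its response headers. A hedged sketch of the continuation; the exact header parsing is an assumption, not ArchiveBox's verbatim code:

    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
        # Assumed: scan the raw response headers for the snapshot URL, e.g. a
        # Content-Location header pointing at /web/<timestamp>/<url>.
        for line in result.stdout.decode().splitlines():
            if line.lower().startswith('content-location:'):
                archive_org_url = 'https://web.archive.org' + line.split(':', 1)[1].strip()
                break
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()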
github pirate / ArchiveBox / archivebox / extractors / wget.py (View on Github)
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or link.link_dir
    if SAVE_WARC:
        warc_dir = os.path.join(out_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
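        # Assumed continuation of the flag list: the full source also passes
        # wget its own timeout flag, mirroring the Python-side limit below.
        '--timeout={}'.format(timeout),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
    finally:
        timer.end()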