How to use the archivebox.util.is_static_file function in archivebox

To help you get started, we’ve selected a few archivebox examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
def should_fetch_git(link: Link, link_dir: Optional[str]=None) -> bool:
    """Decide whether the git fetch step should run for ``link``.

    ``link_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` when the URL is a static file, when a ``git`` entry
    already exists inside the link directory, or when the URL is neither
    hosted on one of ``GIT_DOMAINS`` nor has the ``git`` extension.
    In every other case the ``FETCH_GIT`` setting is returned.
    """
    target_dir = link_dir or link.link_dir

    # Static files and links that already have a git output are skipped.
    if is_static_file(link.url) or os.path.exists(os.path.join(target_dir, 'git')):
        return False

    # Only URLs on a known git host, or ending in the git extension, qualify.
    if domain(link.url) not in GIT_DOMAINS and extension(link.url) != 'git':
        return False

    return FETCH_GIT
github pirate / ArchiveBox / archivebox / extractors / git.py View on Github external
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
    """Decide whether the git extractor should run for ``link``.

    ``out_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` for static-file URLs, when a ``git`` entry already
    exists in the output directory, and for URLs that are neither on a
    ``GIT_DOMAINS`` domain nor carry the ``git`` extension.  Otherwise the
    ``SAVE_GIT`` setting is returned.
    """
    output_root = out_dir or link.link_dir

    if is_static_file(link.url):
        return False

    git_output = os.path.join(output_root, 'git')
    if os.path.exists(git_output):
        # A previous run already produced a git output.
        return False

    looks_clonable = domain(link.url) in GIT_DOMAINS or extension(link.url) == 'git'
    return SAVE_GIT if looks_clonable else False
github pirate / ArchiveBox / archivebox / extractors / archive_org.py View on Github external
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
    """Decide whether ``link`` should be saved to archive.org.

    ``out_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` for static-file URLs and when ``archive.org.txt``
    already exists in the output directory; otherwise returns the
    ``SAVE_ARCHIVE_DOT_ORG`` setting.
    """
    output_root = out_dir or link.link_dir

    # NOTE: only the existence of archive.org.txt is checked; its contents
    # are not inspected.
    already_handled = (
        is_static_file(link.url)
        or os.path.exists(os.path.join(output_root, 'archive.org.txt'))
    )
    if already_handled:
        return False

    return SAVE_ARCHIVE_DOT_ORG
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
def should_fetch_title(link: Link, link_dir: Optional[str]=None) -> bool:
    """Decide whether the title of ``link`` should be fetched.

    Returns ``False`` when the link already has a title that does not start
    with ``http`` (case-insensitive), or when the URL is a static file.
    Otherwise the ``FETCH_TITLE`` setting is returned.

    ``link_dir`` is accepted but not used by this check.
    """
    # A missing title, or one that is just a URL, still needs fetching.
    needs_title = not link.title or link.title.lower().startswith('http')

    if needs_title and not is_static_file(link.url):
        return FETCH_TITLE

    return False
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
def should_fetch_archive_dot_org(link: Link, link_dir: Optional[str]=None) -> bool:
    """Decide whether ``link`` should be submitted to archive.org.

    ``link_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` for static-file URLs and when ``archive.org.txt``
    already exists in the link directory; otherwise returns the
    ``SUBMIT_ARCHIVE_DOT_ORG`` setting.
    """
    target_dir = link_dir or link.link_dir

    if is_static_file(link.url):
        return False

    # NOTE: the marker file's contents are not read; existence alone is
    # enough to skip the submission.
    marker_path = os.path.join(target_dir, 'archive.org.txt')
    return False if os.path.exists(marker_path) else SUBMIT_ARCHIVE_DOT_ORG
github pirate / ArchiveBox / archivebox / extractors / wget.py View on Github external
def wget_output_path(link: Link) -> Optional[str]:
    """Calculate the path to the wget-downloaded .html file.

    wget may adjust some paths to be different than the base_url path;
    see the wget docs on --adjust-extension (-E).

    For static-file URLs (per ``is_static_file``) the URL is passed through
    ``without_fragment`` and then ``without_scheme`` and returned directly.
    """

    if is_static_file(link.url):
        return without_scheme(without_fragment(link.url))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    # NOTE(review): this excerpt ends here; the code that resolves the path
    # for non-static URLs is not visible in this snippet.
github pirate / ArchiveBox / archivebox / extractors / media.py View on Github external
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
    """Decide whether the media extractor should run for ``link``.

    ``out_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` for static-file URLs and when a ``media`` entry
    already exists in the output directory; otherwise returns the
    ``SAVE_MEDIA`` setting.
    """
    output_root = out_dir or link.link_dir

    skip = (
        is_static_file(link.url)
        or os.path.exists(os.path.join(output_root, 'media'))
    )
    return False if skip else SAVE_MEDIA
github pirate / ArchiveBox / archivebox / extractors / pdf.py View on Github external
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
    """Decide whether a PDF should be saved for ``link``.

    ``out_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` for static-file URLs and when ``output.pdf`` already
    exists in the output directory; otherwise returns the ``SAVE_PDF``
    setting.
    """
    output_root = out_dir or link.link_dir

    if is_static_file(link.url):
        return False

    pdf_path = os.path.join(output_root, 'output.pdf')
    if not os.path.exists(pdf_path):
        return SAVE_PDF

    # A PDF from an earlier run is already present.
    return False
github pirate / ArchiveBox / archivebox / extractors / screenshot.py View on Github external
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
    """Decide whether a screenshot should be saved for ``link``.

    ``out_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` for static-file URLs and when ``screenshot.png``
    already exists in the output directory; otherwise returns the
    ``SAVE_SCREENSHOT`` setting.
    """
    output_root = out_dir or link.link_dir

    # Skip static files and links that already have a screenshot on disk.
    if is_static_file(link.url) or os.path.exists(
        os.path.join(output_root, 'screenshot.png')
    ):
        return False

    return SAVE_SCREENSHOT
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
def should_fetch_pdf(link: Link, link_dir: Optional[str]=None) -> bool:
    """Decide whether a PDF should be fetched for ``link``.

    ``link_dir`` falls back to ``link.link_dir`` when it is falsy.

    Returns ``False`` for static-file URLs and when ``output.pdf`` already
    exists in the link directory; otherwise returns the ``FETCH_PDF``
    setting.
    """
    target_dir = link_dir or link.link_dir

    pdf_already_present = (
        is_static_file(link.url)
        or os.path.exists(os.path.join(target_dir, 'output.pdf'))
    )
    if pdf_already_present:
        return False

    return FETCH_PDF