How to use the archivebox.legacy.util.enforce_types function in archivebox

To help you get started, we've selected a few enforce_types examples from the ArchiveBox codebase, based on popular ways it is used in public projects.
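
For context: enforce_types (defined in archivebox/legacy/util.py) is a decorator that checks a function's arguments against its type annotations at call time and raises TypeError on a mismatch, so bad inputs fail at the boundary instead of deep inside the archiving logic. The sketch below is a minimal illustration of that technique, not ArchiveBox's exact implementation:

from functools import wraps
from typing import Union, get_type_hints

def enforce_types(func):
    """Minimal sketch: validate annotated arguments at call time."""
    hints = get_type_hints(func)

    @wraps(func)
    def wrapper(*args, **kwargs):
        # map positional arguments onto their parameter names
        names = func.__code__.co_varnames[:func.__code__.co_argcount]
        bound = dict(zip(names, args), **kwargs)
        for name, value in bound.items():
            hint = hints.get(name)
            if hint is None:
                continue
            # unwrap Optional[X] / Union[...] into its member types
            is_union = getattr(hint, '__origin__', None) is Union
            candidates = hint.__args__ if is_union else (hint,)
            # only check plain classes; skip subscripted generics like List[str]
            concrete = tuple(t for t in candidates if isinstance(t, type))
            if concrete and not isinstance(value, concrete):
                raise TypeError(f'{func.__name__}() got {name}={value!r}, expected {hint}')
        return func(*args, **kwargs)
    return wrapper

Note how Optional[str] unwraps to (str, NoneType), so the out_dir=None defaults used throughout the snippets below pass the check.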


github pirate / ArchiveBox / archivebox / legacy / main.py
@enforce_types
def init():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {OUTPUT_DIR}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {OUTPUT_DIR}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        stderr(
            # (the original snippet is truncated here; this message paraphrases the
            # source's error for a non-empty folder with no index)
            '{red}[X] This folder already contains files, but no index.json was found.{reset}'.format(**ANSI),
        )
        raise SystemExit(1)
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
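    # ... (the subprocess invocation and ArchiveResult construction that follow are
    # truncated in the original snippet)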
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors
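
Since parse_archive_dot_org_response is small and self-contained, it also shows concretely what the decorator buys you. A hypothetical session (the sample response bytes are invented for illustration, and the TypeError assumes enforce_types behaves like the sketch above):

raw = b'HTTP/1.1 200 OK\r\nContent-Location: /web/20200101000000/https://example.com\r\n'
content_location, errors = parse_archive_dot_org_response(raw)
# content_location == ['/web/20200101000000/https://example.com'], errors == []

parse_archive_dot_org_response('not bytes')  # rejected by enforce_types: str is not bytes
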
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
    # if link already has valid title, skip it
    if link.title and not link.title.lower().startswith('http'):
        return False

    if is_static_file(link.url):
        return False

    return SAVE_TITLE
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False
    
    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
        return False

    return SAVE_SCREENSHOT
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
        return False

    return SAVE_FAVICON
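
The should_save_* predicates above all annotate out_dir as Optional[str], so the runtime check must accept None (the default) as well as a real path. Hypothetical calls, assuming an existing Link instance named link:

should_save_favicon(link)                   # out_dir=None: allowed by Optional[str]
should_save_favicon(link, out_dir='/data')  # str: allowed
should_save_favicon(link, out_dir=42)       # int: enforce_types raises TypeError
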
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '<title>',
    ]
    # ... (rest of the function is truncated in the original snippet)
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or link.link_dir
    if SAVE_WARC:
        warc_dir = os.path.join(out_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        # ... (remaining wget flags and the target URL are truncated in the
        # original snippet)
    ]
github pirate / ArchiveBox / archivebox / legacy / archive_methods.py
@enforce_types
def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    ARCHIVE_METHODS = (
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('wget', should_save_wget, save_wget),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    )
    
    out_dir = out_dir or link.link_dir
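
The snippet cuts off before the dispatch loop, but the shape of ARCHIVE_METHODS already tells the story: each entry pairs a should_save_* predicate with its save_* method. A simplified sketch of the loop that follows (the real function also records per-method history, timing, and stats):

for method_name, should_run, run_method in ARCHIVE_METHODS:
    if should_run(link, out_dir):
        result = run_method(link, out_dir=out_dir)  # each save_* returns an ArchiveResult
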
github pirate / ArchiveBox / archivebox / legacy / main.py
@enforce_types
def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
                         after: Optional[float]=None, before: Optional[float]=None,
                         yes: bool=False, delete: bool=False) -> List[Link]:
    
    check_dependencies()
    check_data_folder()

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix='      ')
    try:
        links = list(list_archive_data(
            filter_patterns=filter_patterns,
            filter_type=filter_type,
            after=after,
            before=before,
        ))
    finally:
        # (closing the timer as the TimedProgress pattern implies; the original
        # snippet is truncated from this point on)
        timer.end()