How to use the archivebox.config.check_data_folder function in archivebox

To help you get started, we’ve selected a few archivebox examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pirate / ArchiveBox / archivebox / main.py View on Github external
def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: str='exact',
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Remove the specified URLs from the archive

    The pattern(s) to remove come from either ``filter_str`` or
    ``filter_patterns``. Supplying both exits with status 2; supplying
    neither is reported as an error on stderr.
    """
    
    # Run the data-folder check for out_dir before touching anything else.
    check_data_folder(out_dir=out_dir)

    # The two pattern sources are mutually exclusive.
    if filter_str and filter_patterns:
        stderr(
            '[X] You should pass either a pattern as an argument, '
            'or pass a list of patterns via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    # No pattern at all: explain what is expected and print a usage hint.
    elif not (filter_str or filter_patterns):
        stderr(
            '[X] You should pass either a pattern as an argument, '
            'or pass a list of patterns via stdin.',
            color='red',
        )
        stderr()
        stderr('    {lightred}Hint:{reset} To remove all urls you can run:'.format(**ANSI))
github pirate / ArchiveBox / archivebox / main.py View on Github external
def list_all(filter_patterns_str: Optional[str]=None,
             filter_patterns: Optional[List[str]]=None,
             filter_type: str='exact',
             status: Optional[str]=None,
             after: Optional[float]=None,
             before: Optional[float]=None,
             sort: Optional[str]=None,
             csv: Optional[str]=None,
             json: bool=False,
             out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries"""
    
    # Run the data-folder check for out_dir before reading the index.
    check_data_folder(out_dir=out_dir)

    # Patterns may arrive as a list or as one newline-separated string,
    # but not both at once (exit status 2 in that case).
    if filter_patterns and filter_patterns_str:
        stderr(
            '[X] You should either pass filter patterns as an arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif filter_patterns_str:
        # Normalise the string form into the list form: one pattern per line.
        filter_patterns = filter_patterns_str.split('\n')


    # Delegate the actual filtering to list_links().
    links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
github pirate / ArchiveBox / archivebox / main.py View on Github external
def add(import_str: Optional[str]=None,
        import_path: Optional[str]=None,
        update_all: bool=not ONLY_NEW,
        index_only: bool=False,
        out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive

    Links are supplied either inline via ``import_str`` or as a file via
    ``import_path``. Supplying both exits with status 2.
    """

    # Run the data-folder check for out_dir before importing anything.
    check_data_folder(out_dir=out_dir)

    if import_str and import_path:
        stderr(
            '[X] You should pass either an import path as an argument, '
            'or pass a list of links via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif import_str:
        # Inline text is handed to save_stdin_to_sources(); its return value
        # becomes the import path used from here on.
        import_path = save_stdin_to_sources(import_str, out_dir=out_dir)
    else:
        # NOTE(review): if neither argument was given, import_path is None
        # here -- confirm save_file_to_sources() handles that case.
        import_path = save_file_to_sources(import_path, out_dir=out_dir)

    check_dependencies()

    # Step 1: Load list of links from the existing index
github pirate / ArchiveBox / archivebox / main.py View on Github external
def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
    """Run an ArchiveBox Django management command

    Args:
        args: Arguments forwarded to Django's command-line runner. When
            empty or None, ``help`` is run instead.
        out_dir: Data folder that is checked and then passed to
            ``setup_django``.
    """

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)

    # Imported only after setup_django() has been called.
    from django.core.management import execute_from_command_line

    # First argv entry is the program name shown by Django; fall back to
    # the "help" command when the caller gave no arguments.
    command_args = args or ['help']
    argv = [f'{ARCHIVEBOX_BINARY} manage']
    argv.extend(command_args)
    execute_from_command_line(argv)
github pirate / ArchiveBox / archivebox / main.py View on Github external
# Dispatch on `status` to the matching get_*_folders() helper and return its
# result for the given links and out_dir.
def list_folders(links: List[Link],
                 status: str,
                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    
    # Run the data-folder check for out_dir before inspecting folders.
    check_data_folder(out_dir=out_dir)

    # Statuses handled by the indexed/archived/unarchived helpers.
    if status == 'indexed':
        return get_indexed_folders(links, out_dir=out_dir)
    elif status == 'archived':
        return get_archived_folders(links, out_dir=out_dir)
    elif status == 'unarchived':
        return get_unarchived_folders(links, out_dir=out_dir)

    # Statuses handled by the present/valid/invalid helpers.
    elif status == 'present':
        return get_present_folders(links, out_dir=out_dir)
    elif status == 'valid':
        return get_valid_folders(links, out_dir=out_dir)
    elif status == 'invalid':
        return get_invalid_folders(links, out_dir=out_dir)

    elif status == 'duplicate':
github pirate / ArchiveBox / archivebox / main.py View on Github external
def info(out_dir: str=OUTPUT_DIR) -> None:
    """Print out some info and statistics about the archive collection"""

    # Run the data-folder check for out_dir before scanning it.
    check_data_folder(out_dir=out_dir)

    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
    print(f'    {out_dir}/*')
    # Non-recursive size scan restricted by pattern='index.'; only the byte
    # and file counts are reported below (num_dirs is unused here).
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f'    Size: {size} across {num_files} files')
    print()

    # Count the entries each index source yields so they can be compared.
    links = list(load_main_index(out_dir=out_dir))
    num_json_links = len(links)
    # sum(1 for ...) counts items without building an intermediate list.
    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
    # Usernames only, taken from whatever get_admins() returns.
    users = get_admins().values_list('username', flat=True)
    # ljust(36) pads the left column so the "(found in ...)" notes line up.
    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36),  f'(found in {JSON_INDEX_FILENAME})')
    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
github pirate / ArchiveBox / archivebox / main.py View on Github external
def list_links(filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    """Lazily yield links from the main index that pass the given filters.

    Args:
        filter_patterns: If non-empty, only links for which
            ``link_matches_filter`` is true are yielded.
        filter_type: Passed through to ``link_matches_filter``.
        after: If not None, skip links whose timestamp is below this value.
        before: If not None, skip links whose timestamp is above this value.
        out_dir: Data folder whose main index is loaded.

    Yields:
        Each link that survives the timestamp bounds and pattern filter.
    """

    check_data_folder(out_dir=out_dir)

    for entry in load_main_index(out_dir=out_dir):
        # Timestamp bounds are inclusive: a link exactly at `after` or
        # `before` is kept.
        if after is not None and float(entry.timestamp) < after:
            continue
        if before is not None and float(entry.timestamp) > before:
            continue

        # With no patterns every remaining link passes; otherwise it must match.
        if not filter_patterns or link_matches_filter(entry, filter_patterns, filter_type):
            yield entry
github pirate / ArchiveBox / archivebox / main.py View on Github external
def shell(out_dir: str=OUTPUT_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell

    Args:
        out_dir: Data folder that is checked and then used to set up Django
            before the ``shell_plus`` command is launched.
    """

    check_data_folder(out_dir=out_dir)

    # Set up Django with the caller's out_dir rather than the global
    # OUTPUT_DIR, so the shell operates on the same folder that was just
    # checked (manage() passes out_dir the same way).
    setup_django(out_dir)

    # Imported only after setup_django() has been called.
    from django.core.management import call_command
    call_command("shell_plus")