How to use the archivebox.legacy.config.OUTPUT_DIR constant in archivebox

To help you get started, we’ve selected a few archivebox examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
    """Yield links from the main index, optionally restricted to a timestamp
    window [after, before] and/or a set of filter patterns of the given type."""

    for link in load_main_index(out_dir=OUTPUT_DIR):
        timestamp = float(link.timestamp)
        # Drop links that fall outside the requested timestamp window.
        if after is not None and timestamp < after:
            continue
        if before is not None and timestamp > before:
            continue

        # With no patterns every in-window link is yielded; otherwise only
        # links matching the patterns under the requested filter type.
        if not filter_patterns or link_matches_filter(link, filter_patterns, filter_type):
            yield link
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized

    Returns a merged {folder_path: link_or_None} mapping of all invalid dirs
    under out_dir.
    """
    # BUG FIX: the original ignored the `out_dir` parameter and always queried
    # the global OUTPUT_DIR; forward the caller-supplied directory instead.
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    # Later dicts win on key collisions (unrecognized overrides corrupted, etc.).
    return {**duplicate, **orphaned, **corrupted, **unrecognized}
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized

    Returns a merged {folder_path: link_or_None} mapping of all invalid dirs
    under out_dir.
    """
    # BUG FIX: the original ignored the `out_dir` parameter and always queried
    # the global OUTPUT_DIR; forward the caller-supplied directory instead.
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    # Later dicts win on key collisions (unrecognized overrides corrupted, etc.).
    return {**duplicate, **orphaned, **corrupted, **unrecognized}
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
folder: link
        for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
        if folder not in orphaned_data_dirs
    }
    if invalid_folders:
        print('    {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(orphan_duplicates), **ANSI))
        
    if orphan_duplicates or invalid_folders:
        print('        For more information about the link data directories that were skipped, run:')
        print('            archivebox info')
        print('            archivebox list --status=invalid')
        print('            archivebox list --status=orphaned')
        print('            archivebox list --status=duplicate')


    write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)

    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
    print()
    print('    To view your archive index, open:')
    print('        {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
    print()
    print('    To add new links, you can run:')
    print("        archivebox add 'https://example.com'")
    print()
    print('    For more usage and examples, run:')
    print('        archivebox help')
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized

    Returns a merged {folder_path: link_or_None} mapping of all invalid dirs
    under out_dir.
    """
    # BUG FIX: the original ignored the `out_dir` parameter and always queried
    # the global OUTPUT_DIR; forward the caller-supplied directory instead.
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    # Later dicts win on key collisions (unrecognized overrides corrupted, etc.).
    return {**duplicate, **orphaned, **corrupted, **unrecognized}
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
        }
        print('    √ Loaded {} links from existing main index...'.format(len(all_links)))

    orphaned_json_links = {
        link.url: link
        for link in parse_json_main_index(OUTPUT_DIR)
        if link.url not in all_links
    }
    if orphaned_json_links:
        all_links.update(orphaned_json_links)
        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

    orphaned_sql_links = {
        link.url: link
        for link in parse_sql_main_index(OUTPUT_DIR)
        if link.url not in all_links
    }
    if orphaned_sql_links:
        all_links.update(orphaned_sql_links)
        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))

    orphaned_data_dir_links = {
        link.url: link
        for link in parse_json_links_details(OUTPUT_DIR)
    }
    orphan_new_links = {
        url: link
        for url, link in orphaned_data_dir_links.items()
        if url not in all_links
    }
    orphan_duplicates = {
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
print()

    num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
    num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
    num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
    
    num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
    num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
    print()
    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f'      > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
    
    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f'      > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
    
    if num_indexed:
        print()
        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print('        archivebox list --status=  (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized

    Returns a merged {folder_path: link_or_None} mapping of all invalid dirs
    under out_dir.
    """
    # BUG FIX: the original ignored the `out_dir` parameter and always queried
    # the global OUTPUT_DIR; forward the caller-supplied directory instead.
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    # Later dicts win on key collisions (unrecognized overrides corrupted, etc.).
    return {**duplicate, **orphaned, **corrupted, **unrecognized}
github pirate / ArchiveBox / archivebox / legacy / main.py View on Github external
}
    orphan_duplicates = {
        url: link
        for url, link in orphaned_data_dir_links.items()
        if url in all_links
    }
    if orphan_new_links:
        all_links.update(orphan_new_links)
        print('    {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
    if orphan_duplicates:
        print('    {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))

    orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()}
    invalid_folders = {
        folder: link
        for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
        if folder not in orphaned_data_dirs
    }
    if invalid_folders:
        print('    {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(orphan_duplicates), **ANSI))
        
    if orphan_duplicates or invalid_folders:
        print('        For more information about the link data directories that were skipped, run:')
        print('            archivebox info')
        print('            archivebox list --status=invalid')
        print('            archivebox list --status=orphaned')
        print('            archivebox list --status=duplicate')


    write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)

    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))