How to use the archivebox.legacy.storage.json.parse_json_link_details function in archivebox

To help you get started, we’ve selected a few archivebox examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: pirate/ArchiveBox — archivebox/legacy/main.py (view on GitHub)
def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Dirs that don't contain recognizable archive data and aren't listed in the main index.

    Scans every directory under <out_dir>/<ARCHIVE_DIR_NAME> and flags:
      - folders whose index.json exists but cannot be parsed (corrupted), and
      - folders with no index.json whose name doesn't match any timestamp
        present in the main index.

    Returns a mapping of folder path -> parsed Link (or None when unparseable).
    """
    by_timestamp = {link.timestamp: 0 for link in links}
    unrecognized_folders: Dict[str, Optional[Link]] = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if not entry.is_dir(follow_symlinks=True):
            continue

        index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
        link = None
        try:
            link = parse_json_link_details(entry.path)
        except Exception:
            # best-effort: a missing/corrupt index simply leaves link as None
            pass

        if index_exists and link is None:
            # index exists but it's corrupted or unparseable
            unrecognized_folders[entry.path] = link

        elif not index_exists:
            # link details index doesn't exist and the folder isn't in the main index.
            # Use os.path.basename instead of rsplit('/') so this also works with
            # platform-specific path separators (e.g. '\' on Windows).
            timestamp = os.path.basename(entry.path)
            if timestamp not in by_timestamp:
                unrecognized_folders[entry.path] = link

    return unrecognized_folders
Source: pirate/ArchiveBox — archivebox/legacy/main.py (view on GitHub)
def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Dirs that contain a valid index but aren't listed in the main index."""
    links = list(links)
    indexed_folders = {link.link_dir: link for link in links}
    orphaned_folders: Dict[str, Optional[Link]] = {}

    archive_root = os.path.join(out_dir, ARCHIVE_DIR_NAME)
    for entry in os.scandir(archive_root):
        if not entry.is_dir(follow_symlinks=True):
            continue

        has_index = os.path.exists(os.path.join(entry.path, 'index.json'))
        try:
            parsed = parse_json_link_details(entry.path)
        except Exception:
            # best-effort parse; an unreadable index just yields None
            parsed = None

        # folder is a valid link data dir with index details, but it's not in the main index
        if has_index and entry.path not in indexed_folders:
            orphaned_folders[entry.path] = parsed

    return orphaned_folders
Source: pirate/ArchiveBox — archivebox/legacy/main.py (view on GitHub)
# NOTE(review): this snippet is a fragment — the enclosing `def` header was
# lost in extraction (upstream it appears to be the body of
# get_duplicate_folders(links, out_dir) — TODO confirm against ArchiveBox),
# so `links` and `out_dir` are unbound here and the first line's indentation
# is inconsistent with the rest of the body.
by_url = {link.url: 0 for link in links}
    by_timestamp = {link.timestamp: 0 for link in links}

    # folders that collide with another folder on timestamp or URL
    duplicate_folders = {}

    indexed_folders = {link.link_dir for link in links}
    # on-disk dirs under the archive folder not already covered by the main index
    data_folders = (
        entry.path
        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
    )

    for path in chain(sorted(indexed_folders), sorted(data_folders)):
        link = None
        try:
            link = parse_json_link_details(path)
        except Exception:
            # unparseable index -> folder can't participate in duplicate detection
            pass

        if link:
            # link folder has same timestamp as different link folder
            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
            if by_timestamp[link.timestamp] > 1:
                duplicate_folders[path] = link

            # link folder has same url as different link folder
            by_url[link.url] = by_url.get(link.url, 0) + 1
            if by_url[link.url] > 1:
                duplicate_folders[path] = link

    return duplicate_folders
Source: pirate/ArchiveBox — archivebox/legacy/main.py (view on GitHub)
def is_valid(link: Link) -> bool:
    """Return True only if the link's data dir exists, contains an index.json,
    and the parsed index's URL matches the link's URL."""
    data_dir = link.link_dir

    if not os.path.exists(data_dir):
        # unarchived links are not included in the valid list
        return False

    if not os.path.exists(os.path.join(data_dir, 'index.json')):
        return False

    try:
        return link.url == parse_json_link_details(data_dir).url
    except Exception:
        # parse failures mean the index is unusable -> not valid
        return False
Source: pirate/ArchiveBox — archivebox/legacy/main.py (view on GitHub)
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Every dir currently present under the archive folder, parsed where possible.

    Note: `links` is accepted for signature parity with the sibling helpers but
    is not consulted — only the filesystem under <out_dir>/<ARCHIVE_DIR_NAME>
    is scanned. Values are the parsed Link, or None when parsing fails.
    """
    present: Dict[str, Optional[Link]] = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if not entry.is_dir(follow_symlinks=True):
            continue
        try:
            parsed = parse_json_link_details(entry.path)
        except Exception:
            parsed = None
        present[entry.path] = parsed

    return present