How to use the archivebox.schema.ArchiveResult class in archivebox

To help you get started, we’ve selected a few archivebox examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / schema.py View on Github external
def typecheck(self) -> None:
        """Sanity-check the type and shape of every field on this instance.

        Every check is a plain ``assert``, so a violation surfaces as an
        ``AssertionError``.
        """
        # the stored schema label has to name the concrete class
        assert self.schema == self.__class__.__name__

        # timestamp: non-empty string that is all digits once the dots are dropped
        timestamp = self.timestamp
        assert isinstance(timestamp, str) and timestamp
        assert timestamp.replace('.', '').isdigit()

        # url: a string that contains a scheme separator
        url = self.url
        assert isinstance(url, str) and '://' in url

        # optional fields: either None or a (non-empty, for strings) value of the right type
        updated = self.updated
        assert updated is None or isinstance(updated, datetime)

        title = self.title
        assert title is None or (isinstance(title, str) and title)

        tags = self.tags
        assert tags is None or (isinstance(tags, str) and tags)

        # sources: a list of non-empty strings
        sources = self.sources
        assert isinstance(sources, list)
        for source in sources:
            assert isinstance(source, str) and source

        # history: non-empty method name -> list of ArchiveResult entries
        history = self.history
        assert isinstance(history, dict)
        for method_name, method_results in history.items():
            assert isinstance(method_name, str) and method_name
            assert isinstance(method_results, list)
            for entry in method_results:
                assert isinstance(entry, ArchiveResult)
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)

        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed git download', hints)

    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=GIT_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / schema.py View on Github external
info['start_ts'] = parse_date(info['start_ts'])
        info['end_ts'] = parse_date(info['end_ts'])
        return cls(**info)

    @property
    def duration(self) -> int:
        """Whole seconds elapsed between ``start_ts`` and ``end_ts``.

        Uses ``timedelta.total_seconds()`` rather than ``timedelta.seconds``:
        the latter is only the seconds *component* (0-86399), so it silently
        drops whole days and reports a large positive number for a negative
        interval. The result is truncated toward zero to keep the ``int``
        return type.
        """
        return int((self.end_ts - self.start_ts).total_seconds())

@dataclass(frozen=True)
class Link:
    timestamp: str
    url: str
    title: Optional[str]
    tags: Optional[str]
    sources: List[str]
    history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
    updated: Optional[datetime] = None
    schema: str = 'Link'

    def __post_init__(self):
        """Run ``typecheck()`` on the instance as soon as it has been initialised."""
        self.typecheck()

    def overwrite(self, **kwargs):
        """Return a new Link built from this one's fields with ``kwargs`` layered on top.

        The current instance is left untouched; keys given in ``kwargs`` win
        over the existing field values.
        """
        merged = dict(self._asdict())
        merged.update(kwargs)
        return Link(**merged)

    def __eq__(self, other):
        """Compare by URL only; defer to the other operand when it is not a Link."""
        if isinstance(other, Link):
            return self.url == other.url
        return NotImplemented

    def __gt__(self, other):
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to fetch DOM', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
output = err
    finally:
        timer.end()

    if output and not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, str(output)), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to take screenshot', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
*output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
github pirate / ArchiveBox / archivebox / schema.py View on Github external
allowed_fields = {f.name for f in fields(cls)}
        info = {
            key: val
            for key, val in json_info.items()
            if key in allowed_fields
        }
        info['updated'] = parse_date(info['updated'])

        json_history = info['history']
        cast_history = {}

        for method, method_history in json_history.items():
            cast_history[method] = []
            for json_result in method_history:
                assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
                cast_result = ArchiveResult.from_json(json_result)
                cast_history[method].append(cast_result)

        info['history'] = cast_history
        return cls(**info)