How to use the archivebox.util.enforce_types function in archivebox

To help you get started, we’ve selected a few archivebox.util.enforce_types examples, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
@enforce_types
def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(link_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
github pirate / ArchiveBox / archivebox / parsers / __init__.py View on Github external
@enforce_types
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
    """download a given url's content into output/sources/domain-.txt"""
    check_data_folder(out_dir=out_dir)

    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
    if not os.path.exists(sources_dir):
        os.makedirs(sources_dir)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
        print('{}[*] [{}] Downloading {}{}'.format(
            ANSI['green'],
github pirate / ArchiveBox / archivebox / extractors / favicon.py View on Github external
@enforce_types
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', str(output),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
@enforce_types
def should_fetch_media(link: Link, link_dir: Optional[str]=None) -> bool:
    """Decide whether the media fetch step should run for ``link``.

    Returns ``False`` when ``is_static_file(link.url)`` is truthy or when a
    ``media`` entry already exists inside the link directory; otherwise
    returns the ``FETCH_MEDIA`` setting.  ``link_dir`` falls back to
    ``link.link_dir`` when it is not supplied (or is empty).
    """
    base_dir = link_dir or link.link_dir

    # Short-circuit keeps the filesystem check from running for static files.
    if is_static_file(link.url) or os.path.exists(os.path.join(base_dir, 'media')):
        return False

    return FETCH_MEDIA
github pirate / ArchiveBox / archivebox / main.py View on Github external
@enforce_types
def run(
    subcommand: str,
    subcommand_args: Optional[List[str]],
    stdin: Optional[IO]=None,
    out_dir: str=OUTPUT_DIR,
) -> None:
    """Dispatch an ArchiveBox subcommand via ``run_subcommand``.

    The subcommand name, its argument list and ``stdin`` are forwarded
    unchanged; ``out_dir`` is handed over as the ``pwd`` keyword.  Whatever
    ``run_subcommand`` returns is discarded.
    """
    run_subcommand(subcommand=subcommand, subcommand_args=subcommand_args, stdin=stdin, pwd=out_dir)
github pirate / ArchiveBox / archivebox / index / html.py View on Github external
@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Render the details template for ``link`` and write it to disk.

    The output goes to ``HTML_INDEX_FILENAME`` inside ``out_dir``, which
    falls back to ``link.link_dir`` when not supplied (or empty).  The write
    itself is delegated to ``atomic_write``.
    """
    target_dir = out_dir or link.link_dir

    # Arguments evaluate left to right: the template is rendered before the
    # destination path is built.
    atomic_write(
        link_details_template(link),
        join(target_dir, HTML_INDEX_FILENAME),
    )
github pirate / ArchiveBox / archivebox / main.py View on Github external
@enforce_types
def list_links(filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    
    check_data_folder(out_dir=out_dir)

    all_links = load_main_index(out_dir=out_dir)

    for link in all_links:
        if after is not None and float(link.timestamp) < after:
            continue
        if before is not None and float(link.timestamp) > before:
            continue
github pirate / ArchiveBox / archivebox / index / sql.py View on Github external
@enforce_types
def apply_migrations(out_dir: str=OUTPUT_DIR) -> List[str]:
    """Run Django's ``makemigrations`` and ``migrate`` commands for ``out_dir``.

    Django is set up with ``check_db=False`` first.  Output from
    ``makemigrations`` is captured and thrown away; output from ``migrate``
    is captured and returned as a list of stripped, non-empty lines.
    """
    setup_django(out_dir, check_db=False)
    from django.core.management import call_command

    discarded = StringIO()
    migrate_output = StringIO()
    call_command("makemigrations", interactive=False, stdout=discarded)
    call_command("migrate", interactive=False, stdout=migrate_output)

    # Strip each captured line once, then drop the ones that end up empty.
    stripped_lines = (raw.strip() for raw in migrate_output.getvalue().split('\n'))
    return [text for text in stripped_lines if text]
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
@enforce_types
def fetch_media(link: Link, link_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'media'
    output_path = os.path.join(link_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--no-check-certificate',
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
@enforce_types
def fetch_favicon(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', str(output),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try: