How to use the libratom.lib.download.download_files function in libratom

To help you get started, we’ve selected a few libratom examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github libratom / libratom / tests / unit / test_libratom.py View on Github external
def test_download_files_with_bad_urls():
    """Check that download_files raises RuntimeError when mocked responses are not ok."""

    # Six copies of the same URL; requests are mocked, so nothing is fetched
    bad_urls = ["http://foobar" for _ in range(6)]

    with TemporaryDirectory() as tmpdir:
        with patch("requests.Session.get") as mock_get:
            # Every mocked GET returns a response flagged as failed
            mock_get.return_value.ok = False

            with pytest.raises(RuntimeError):
                download_files(bad_urls, Path(tmpdir))
github libratom / libratom / tests / unit / test_libratom.py View on Github external
def test_download_files(directory_of_mbox_files, dry_run):
    """Call download_files again on URLs the fixture is expected to have fetched."""

    # Requesting the fixture (and checking it is truthy) ensures the files
    # are already present before the download is attempted again
    assert directory_of_mbox_files

    destination = Path("/tmp/libratom/test_data/httpd-users")

    # One monthly mbox archive per month number 1 through 6
    urls = [
        f"https://mail-archives.apache.org/mod_mbox/httpd-users/20190{month}.mbox"
        for month in range(1, 7)
    ]

    download_files(urls, destination, dry_run=dry_run)
github libratom / libratom / libratom / scripts / get_media_type_list.py View on Github external
"message",
        "model",
        "multipart",
        "text",
        "video",
    ]

    # CSV files to download: one IANA registry listing per entry in media_type_registries
    urls = [
        f"https://www.iana.org/assignments/media-types/{registry}.csv"
        for registry in media_type_registries
    ]

    # The CSV files are parsed inside this `with` block, i.e. before the
    # temporary directory's context exits
    with TemporaryDirectory() as tmpdir:
        directory = Path(tmpdir)
        download_files(urls, directory, dry_run=False)

        for file in directory.glob("*.csv"):
            # newline="" is what the csv module documentation recommends for
            # file objects handed to csv.reader
            with file.open(newline="") as csvfile:
                reader = csv.reader(csvfile)

                # Use the first token (Name) in each row, skip headers
                # The split is to strip DEPRECATED/OBSOLETED/... mentions appended to the name
                # NOTE(review): a completely empty row would fail to unpack into [name, *_]
                for [name, *_] in reader:
                    if name != "Name":
                        # Recorded as "<csv file stem>/<first whitespace-separated token of name>"
                        media_types.append(f"{file.stem}/{name.split(maxsplit=1)[0]}")

    # Write the collected media types, sorted, as JSON indented by 4 spaces
    with out.open(mode="w") as f:
        json.dump(sorted(media_types), f, indent=4)
github libratom / libratom / libratom / scripts / download_edrm_zipped_files.py View on Github external
# Base names used to build the archive URLs below (one "<name>.zip" each)
names = [
        "albert_meyers",
        "andrea_ring",
        "andrew_lewis",
        "andy_zipper",
        "chris_dorland",
        "jason_wolfe",
        "vkaminski"
    ]

    # Zip files to download, located under ENRON_DATASET_URL
    urls = [
        f"{ENRON_DATASET_URL}/{name}.zip" for name in names
    ]

    # Download into CACHED_ENRON_DATA_DIR with dry_run explicitly disabled
    download_files(urls, CACHED_ENRON_DATA_DIR, dry_run=False)