How to use the textacy.io.download_file function in textacy

To help you get started, we’ve selected a few examples showing how textacy.io.download_file is commonly used in public projects.


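Before the project snippets below, here is a minimal standalone sketch of the call pattern they all share: pass a URL plus optional filename, dirpath, and force arguments, then use the returned local path to decide whether to unpack an archive. The URL and data directory here are placeholders, not part of any project.

import textacy.io as tio

# Hypothetical archive URL and local data directory, for illustration only.
url = "https://example.com/datasets/archive.zip"
data_dir = "./data"

# download_file() saves the remote file under ``dirpath`` and returns its local path;
# if the file is already on disk and force=False, the download is skipped.
filepath = tio.download_file(
    url,
    filename="archive.zip",  # pass filename=None to derive a name from the URL
    dirpath=data_dir,
    force=False,
)
if filepath:
    # unpack zip/tar archives in place, as several of the snippets below do
    tio.unpack_archive(filepath, extract_dir=data_dir)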

Source: chartbeat-labs/textacy, textacy/datasets/reddit_comments.py
    # Reconstructed for readability: the snippet omits the ``def`` line; the
    # keyword-only signature below is inferred from the Args documented in the docstring.
    def download(self, *, date_range=(None, None), force=False):
        """
        Args:
            date_range (Tuple[str]): Interval specifying the [start, end) dates
                for which comments files will be downloaded. Each item must be
                a str formatted as YYYY-MM or YYYY-MM-DD (the latter is converted
                to the corresponding YYYY-MM value). Both start and end values
                must be specified, but a null value for either is automatically
                replaced by the minimum or maximum valid values, respectively.
            force (bool): If True, download the dataset, even if it already
                exists on disk under ``data_dir``.
        """
        date_range = utils.validate_and_clip_range(
            date_range, self.full_date_range, val_type=(str, bytes))
        filestubs = self._generate_filestubs(date_range)
        for filestub in filestubs:
            tio.download_file(
                urllib.parse.urljoin(DOWNLOAD_ROOT, filestub),
                filename=filestub,
                dirpath=self.data_dir,
                force=force,
            )
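For context, a rough usage sketch of how this method is reached through the dataset class (assuming the class is exposed as textacy.datasets.RedditComments; the dates are arbitrary examples):

import textacy.datasets

# Download only the monthly comment files for January and February 2015;
# the [start, end) interval means "2015-03" itself is excluded.
ds = textacy.datasets.RedditComments()
ds.download(date_range=("2015-01", "2015-03"))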

Source: chartbeat-labs/textacy, textacy/datasets/udhr.py
    def download(self, *, force=False):
        """
        Download the data as a zipped archive of language-specific text files,
        then save it to disk and extract its contents under the ``data_dir`` directory.

        Args:
            force (bool): If True, always download the dataset even if
                it already exists.
        """
        filepath = tio.download_file(
            DOWNLOAD_URL,
            filename="udhr_txt.zip",
            dirpath=self.data_dir,
            force=force,
        )
        if filepath:
            tio.unpack_archive(filepath, extract_dir=self.data_dir.joinpath("udhr_txt"))
        self._check_data()
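The same download-then-unpack flow backs the higher-level dataset API. A brief end-to-end sketch, assuming the textacy.datasets.UDHR class and its texts() iterator (the limit value is arbitrary):

import textacy.datasets

ds = textacy.datasets.UDHR()
ds.download()  # fetches and extracts udhr_txt.zip under ds.data_dir, as shown above
for text in ds.texts(limit=3):
    print(text[:100])  # preview the first few documents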

Source: chartbeat-labs/textacy, textacy/datasets/imdb.py
    def download(self, *, force=False):
        """
        Download the data as a compressed tar archive file, then save it to disk and
        extract its contents under the ``data_dir`` directory.

        Args:
            force (bool): If True, always download the dataset even if
                it already exists.
        """
        filepath = tio.download_file(
            DOWNLOAD_URL,
            filename="aclImdb.tar.gz",
            dirpath=self.data_dir,
            force=force,
        )
        if filepath:
            tio.unpack_archive(filepath, extract_dir=None)
        self._check_data()

Source: chartbeat-labs/textacy, textacy/datasets/wikimedia.py
    def download(self, *, force=False):
        """
        Download the Wikimedia CirrusSearch db dump corresponding to the given
        ``project``, ``lang``, and ``version`` as a compressed JSON file,
        and save it to disk under the ``data_dir`` directory.

        Args:
            force (bool): If True, download the dataset, even if it already
                exists on disk under ``data_dir``.

        Note:
            Some datasets are quite large (e.g. English Wikipedia is ~28GB)
            and can take hours to fully download.
        """
        file_url = self._get_file_url()
        tio.download_file(
            file_url,
            filename=self._filestub,
            dirpath=self.data_dir,
            force=force,
        )
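In practice this method is called through the concrete Wikimedia subclasses. A hedged sketch, assuming textacy's Wikipedia dataset class and its lang/version parameters:

import textacy.datasets

# English Wikipedia CirrusSearch dump; note the ~28GB size warning above.
wp = textacy.datasets.Wikipedia(lang="en", version="current")
wp.download()  # delegates to tio.download_file() with the dump's URL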

Source: chartbeat-labs/textacy, scripts/train_lang_identifier.py
# Reconstructed for readability: the snippet omits the ``def`` line; the name and
# signature are inferred from the docstring below and the other download_* helpers here.
def download_dslcc_data(dirpath, force=False):
    """
    Download two multilingual collections of short excerpts of journalistic texts,
    focused on language groups that are very similar and thus more difficult
    to correctly identify.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        http://ttg.uni-saarland.de/resources/DSLCC/
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    for version in [3, 4]:
        name = "dslcc{}".format(version)
        url = "http://scholar.harvard.edu/files/malmasi/files/{}.zip".format(name)
        fpath = textacy.io.download_file(url, dirpath=dirpath, force=force)
        if fpath:
            textacy.io.unpack_archive(fpath, extract_dir=dirpath.joinpath(name))

Source: chartbeat-labs/textacy, scripts/train_lang_identifier.py
def download_iso_639_data(dirpath, force=False):
    """
    Download official ISO 639 code table as a TSV,
    mapping all language code variations (639-1, 639-2, 639-3)
    to each other.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        https://iso639-3.sil.org/code_tables/639/data
    """
    url = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab"
    textacy.io.download_file(url, filename="iso-639-3.tsv", dirpath=dirpath, force=force)
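Once downloaded, the file is plain tab-separated text. A minimal sketch of building a 639-1 -> 639-3 lookup from it with the standard library; the column names follow the published iso-639-3.tab header and the path assumes dirpath was "data", so verify both against your download:

import csv
import pathlib

tsv_path = pathlib.Path("data", "iso-639-3.tsv")  # wherever ``dirpath`` pointed
with tsv_path.open(encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    # "Part1" is the two-letter 639-1 code (may be empty); "Id" is the three-letter 639-3 code
    part1_to_part3 = {row["Part1"]: row["Id"] for row in reader if row["Part1"]}

print(part1_to_part3.get("en"))  # expected: "eng"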

Source: chartbeat-labs/textacy, textacy/datasets/oxford_text_archive.py
    def download(self, *, force=False):
        """
        Download the data as a zip archive file, then save it to disk and
        extract its contents under the :attr:`OxfordTextArchive.data_dir` directory.

        Args:
            force (bool): If True, always download the dataset even if
                it already exists.
        """
        filepath = tio.download_file(
            DOWNLOAD_URL,
            filename=None,
            dirpath=self.data_dir,
            force=force,
        )
        if filepath:
            tio.unpack_archive(filepath, extract_dir=None)

Source: chartbeat-labs/textacy, textacy/datasets/supreme_court.py
    def download(self, *, force=False):
        """
        Download the data as a Python version-specific compressed json file and
        save it to disk under the ``data_dir`` directory.

        Args:
            force (bool): If True, download the dataset, even if it already
                exists on disk under ``data_dir``.
        """
        release_tag = "supreme_court_py3_v{data_version}".format(data_version=1.0)
        url = urllib.parse.urljoin(DOWNLOAD_ROOT, release_tag + "/" + self._filename)
        tio.download_file(
            url,
            filename=self._filename,
            dirpath=self.data_dir,
            force=force,
        )

Source: chartbeat-labs/textacy, scripts/train_lang_identifier.py
def download_wili_data(dirpath, force=False):
    """
    Args:
        dirpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        https://tatoeba.org/eng/downloads
    """
    url = "https://zenodo.org/record/841984/files/wili-2018.zip?download=1"
    fpath = textacy.io.download_file(url, dirpath=dirpath, force=force)
    if fpath:
        textacy.io.unpack_archive(fpath, extract_dir=dirpath)