How to use the textacy.io.read_csv function in textacy

To help you get started, we’ve selected a few textacy.io.read_csv examples based on popular ways the function is used in public projects.

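At its core, textacy.io.read_csv takes a file path plus the usual csv options (delimiter, dialect, fieldnames, quoting) and lazily yields one row at a time. A minimal sketch inferred from the examples below; the file name is hypothetical:

    import textacy.io as tio

    # read_csv returns a generator; without fieldnames, each row
    # is a plain list of values, so materialize it with list()
    rows = list(tio.read_csv("example.csv", delimiter=","))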

github chartbeat-labs / textacy / tests / test_io.py
def test_read_write_delimiters(self, tmpdir):
        expected = [
            ["this is some text", "scandal", 42.0],
            ["here's some more text: boom!", "escándalo", 1.0],
        ]
        for delimiter in (",", "\t", "|", ":"):
            filepath = str(tmpdir.join("test_read_write_csv.csv"))
            io.write_csv(expected, filepath, delimiter=delimiter, make_dirs=True)
            observed = list(io.read_csv(filepath, delimiter=delimiter))
            assert observed == expected
github chartbeat-labs / textacy / tests / test_io.py
def test_read_write_dict(self, tmpdir):
        expected = [
            {"text": "this is some text", "kind": "scandal", "number": 42.0},
            {"text": "here's some more text: boom!", "kind": "escándalo", "number": 1.0},
        ]
        filepath = str(tmpdir.join("test_read_write_csv_dict.csv"))
        io.write_csv(
            expected,
            filepath,
            dialect="excel",
            make_dirs=True,
            fieldnames=["text", "kind", "number"],
        )
        observed = [
            dict(item)
            for item in io.read_csv(
                filepath, dialect="excel", fieldnames=["text", "kind", "number"]
            )
        ]
        assert observed == expected
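When fieldnames (and here an explicit dialect) are passed, each row that read_csv yields can be addressed, and cast, like a dict keyed by column name. A standalone sketch along the same lines, with a hypothetical file and columns:

    import textacy.io as tio

    # with fieldnames, rows come back keyed by column name
    for row in tio.read_csv("people.csv", fieldnames=["name", "age"]):
        print(row["name"], row["age"])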
github chartbeat-labs / textacy / tests / test_io.py
def test_read_write_compressed(self, tmpdir):
        expected = [
            ["this is some text", "scandal", 42.0],
            ["here's some more text: boom!", "escándalo", 1.0],
        ]
        for ext in (".csv", ".csv.gz", ".csv.bz2", ".csv.xz"):
            filepath = str(tmpdir.join("test_read_write_csv" + ext))
            if compat.PY2 and ext != ".csv":
                with pytest.raises(ValueError):
                    io.open_sesame(filepath, mode="wt", encoding=None, make_dirs=True)
            else:
                io.write_csv(expected, filepath, make_dirs=True)
                observed = list(io.read_csv(filepath))
                assert observed == expected
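The test above also shows that compression is transparent: the file extension (.gz, .bz2, .xz) determines the codec, and on Python 2 only uncompressed .csv is supported. A minimal sketch with a hypothetical gzipped file:

    import textacy.io as tio

    # the ".gz" extension triggers transparent gzip decompression
    rows = list(tio.read_csv("corpus.csv.gz"))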
github chartbeat-labs / textacy / textacy / resources / concept_net.py
rel_fname = "{}.json.gz".format(_split_uri(relation)[1].lower())
        rel_fpath = self.data_dir.joinpath(rel_fname)
        if rel_fpath.is_file():
            LOGGER.debug("loading data for '%s' relation from %s", relation, rel_fpath)
            return next(
                tio.read_json(rel_fpath, mode="rt", encoding="utf-8", lines=False)
            )
        else:
            rel_data = collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(set)
                )
            )
            LOGGER.info(
                "preparing data for '%s' relation; this may take a while...", relation)
            rows = tio.read_csv(self.filepath, delimiter="\t", quoting=1)  # quoting=1 == csv.QUOTE_ALL
            with tqdm() as pbar:
                for row in rows:
                    pbar.update(1)
                    _, rel_type, start_uri, end_uri, _ = row
                    # the assertions file is sorted by relation, so skip rows
                    # until the target relation, then stop once we're past it
                    if rel_type < relation:
                        continue
                    elif rel_type > relation:
                        break
                    start_lang, start_term, start_sense = _parse_concept_uri(start_uri)
                    end_lang, end_term, end_sense = _parse_concept_uri(end_uri)
                    if start_lang == end_lang and start_term != end_term:
                        rel_data[start_lang][start_term][start_sense].add(end_term)
                        if is_symmetric:
                            rel_data[start_lang][end_term][end_sense].add(start_term)
            # make relation data json-able (i.e. cast set => list)
            for terms in rel_data.values():
                for senses in terms.values():
                    for sense, rel_terms in senses.items():
                        senses[sense] = list(rel_terms)
github chartbeat-labs / textacy / scripts / train_lang_identifier.py
def load_iso_639_data(dirpath, exclude=None):
    """
    Args:
        dirpath (str or :class:`pathlib.Path`)
        exclude (Set[str])

    Returns:
        Dict[str, str]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    rows = textacy.io.read_csv(
        dirpath.joinpath("iso-639-3.tsv").resolve(),
        delimiter="\t",
        fieldnames=["Id", "Part2B", "Part2T", "Part1", "Scope", "Language_Type", "Ref_Name", "Comment"],
        quoting=1,
    )
    lang_map = {
        row["Id"]: row["Part1"]
        for row in rows
        if row.get("Part1") and
        (exclude is None or row["Part1"] not in exclude)
    }
    return lang_map
github chartbeat-labs / textacy / scripts / train_lang_identifier.py
def load_tatoeba_data(dirpath, iso_lang_map, min_len=25):
    """
    Args:
        dirpath (str or :class:`pathlib.Path`)
        iso_lang_map (Dict[str, str])
        min_len (int): minimum text length in *chars*

    Returns:
        List[Tuple[str, str]]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    rows = textacy.io.read_csv(
        dirpath.joinpath("sentences.csv"),
        fieldnames=["sent_id", "iso-639-3", "text"],
        delimiter="\t",
        quoting=1,
    )
    langs = set(iso_lang_map.keys())
    ds = [
        (row["text"], iso_lang_map[row["iso-639-3"]])
        for row in rows
        if row["iso-639-3"] in langs
        and itertoolz.count(char for char in row["text"] if char.isalnum()) >= min_len
    ]
    return ds
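The two helpers compose: load_iso_639_data builds the ISO-639-3 to ISO-639-1 mapping that load_tatoeba_data then uses to label sentences. A hypothetical invocation, with assumed directory paths and exclusion set:

    iso_lang_map = load_iso_639_data("data/iso-639", exclude={"eo"})
    dataset = load_tatoeba_data("data/tatoeba", iso_lang_map, min_len=25)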