# NB: a float `n_keyterms` is interpreted as a fraction of the doc's tokens
n_keyterms = int(round(n_toks * n_keyterms))
if window_width < 2:
    raise ValueError("`window_width` must be >= 2")
window_width = min(n_toks, window_width)
min_term_freq = min(n_toks // 1000, 4)
if isinstance(ngrams, int):
    ngrams = (ngrams,)
# build full list of candidate terms
# if inverse doc freqs available, include nouns, adjectives, and verbs;
# otherwise, just include nouns and adjectives
# (without IDF downweighting, verbs dominate the results in a bad way)
include_pos = {"NOUN", "PROPN", "ADJ", "VERB"} if idf else {"NOUN", "PROPN", "ADJ"}
terms = itertoolz.concat(
    extract.ngrams(
        doc,
        n,
        filter_stops=True,
        filter_punct=True,
        filter_nums=False,
        include_pos=include_pos,
        min_freq=min_term_freq,
    )
    for n in ngrams
)
# get normalized term strings, as desired, each paired with its
# positional index in the document and its length in a 3-tuple
if normalize == "lemma":
terms = [(term.lemma_, term.start, len(term)) for term in terms]
elif normalize == "lower":
def test_records():
    for text, meta in DATASET.records(limit=3):
        assert isinstance(text, compat.unicode_)
        assert isinstance(meta, dict)


def test_texts():
    for text in DATASET.texts(limit=3):
        assert isinstance(text, compat.unicode_)

def test_plaintext_functionality(text):
    # chain the preprocessing steps so each builds on the previous result
    preprocessed_text = preprocessing.normalize_whitespace(text)
    preprocessed_text = preprocessing.remove_punctuation(preprocessed_text)
    preprocessed_text = preprocessed_text.lower()
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = "America"
    kwics = text_utils.keyword_in_context(
        text, keyword, window_width=35, print_only=False
    )
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)

def test_lang(self, doc):
    lang = doc._.lang
    assert isinstance(lang, compat.unicode_)
    assert lang == doc.vocab.lang

def test_read_write_sparse_csr_compressed(self, tmpdir):
    expected = sp.csr_matrix(
        (
            np.array([1, 2, 3, 4, 5, 6]),
            (np.array([0, 0, 1, 2, 2, 2]), np.array([0, 2, 2, 0, 1, 2])),
        ),
        shape=(3, 3),
    )
    filepath = str(tmpdir.join("test_read_write_sparse_matrix_csr_compressed.npz"))
    io.write_sparse_matrix(expected, filepath, compressed=True)
    observed = io.read_sparse_matrix(filepath, kind="csr")
    assert abs(observed - expected).nnz == 0
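For comparison, the same compressed CSR round trip can be done with scipy's own .npz helpers; this is plain scipy.sparse usage, not the io module exercised by the test above:

import numpy as np
import scipy.sparse as sp

# the same 3x3 CSR matrix as in the test, saved and reloaded via scipy directly
mat = sp.csr_matrix(
    (
        np.array([1, 2, 3, 4, 5, 6]),
        (np.array([0, 0, 1, 2, 2, 2]), np.array([0, 2, 2, 0, 1, 2])),
    ),
    shape=(3, 3),
)
sp.save_npz("matrix.npz", mat, compressed=True)
loaded = sp.load_npz("matrix.npz")
assert abs(loaded - mat).nnz == 0  # no stored entries differ
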
def test_unpack_archive(tmpdir):
    data = "Here's some text data to pack and unpack."
    fpath_txt = str(tmpdir.join("test_unpack_archive.txt"))
    with tio.open_sesame(fpath_txt, mode="wt") as f:
        f.write(data)
    fpath_zip = str(tmpdir.join("test_unpack_archive.zip"))
    with zipfile.ZipFile(fpath_zip, "w") as f:
        f.write(fpath_txt)
    unpack_archive(fpath_zip, extract_dir=tmpdir)
    fpath_tar = str(tmpdir.join("test_unpack_archive.tar"))
    with tarfile.TarFile(fpath_tar, "w") as f:
        f.add(fpath_txt)
    unpack_archive(fpath_tar, extract_dir=tmpdir)
    unpack_archive(fpath_txt, extract_dir=tmpdir)

def test_read_write_bytes_lines(self, tmpdir, spacy_doc):
    expected = [
        {"idx": i, "sent": sent.text} for i, sent in enumerate(spacy_doc.sents)
    ]
    for ext in (".json", ".json.gz", ".json.bz2", ".json.xz"):
        filepath = str(tmpdir.join("test_read_write_json_lines_bytes" + ext))
        if compat.PY2 is True:
            if ext == ".json.xz":
                # .xz (lzma) files aren't supported under Python 2, so opening one should raise
                with pytest.raises(ValueError):
                    io.open_sesame(
                        filepath, mode="wb", encoding="utf-8", make_dirs=True
                    )
            else:
                io.write_json(expected, filepath, mode="wb", make_dirs=True, lines=True)
                observed = list(io.read_json(filepath, mode="rb", lines=True))
                assert observed == expected
        else:
            # under Python 3, writing JSON text to a binary-mode file raises TypeError
            with pytest.raises(TypeError):
                io.write_json(
                    expected,
                    filepath,
                    mode="wb",
                    encoding=None,
                    make_dirs=True,
                    lines=True,
                )
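The lines=True mode exercised above is ordinary JSON Lines: one JSON object per line of the file. A minimal standard-library sketch of the same round trip in text mode (filename and records here are illustrative only):

import json

records = [{"idx": 0, "sent": "First sentence."}, {"idx": 1, "sent": "Second sentence."}]
# write one JSON object per line, then read them back
with open("records.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")
with open("records.jsonl", "r", encoding="utf-8") as f:
    loaded = [json.loads(line) for line in f]
assert loaded == records
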
def test_stats():
    # 2 sentences, 11 words: "the quick fox and the cat" (6) + "The turtle and the rabbit" (5)
    text = "the quick fox and the cat. The turtle and the rabbit."
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    stats = nlp.compute_stats(doc)
    assert stats.counts.sentences == 2
    assert stats.counts.words == 11

def empty_spacy_doc():
    return textacy.make_spacy_doc("", lang="en")
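empty_spacy_doc reads like a pytest fixture body with its decorator stripped; a hypothetical sketch of how such a fixture would be declared and consumed (the decorator, scope, and consuming test are illustrative assumptions, not part of the original snippet):

import pytest
import textacy

@pytest.fixture(scope="module")
def empty_spacy_doc():
    # an empty doc backed by the English pipeline
    return textacy.make_spacy_doc("", lang="en")

def test_empty_doc_has_no_tokens(empty_spacy_doc):
    assert len(empty_spacy_doc) == 0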