# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
# NOTE(review): fragment of a graph-based keyterm extractor -- the enclosing
# ``def`` is outside this chunk, so ``doc``, ``topn``, ``normalize``, and
# ``include_pos`` are presumed parameters; confirm against the full function.
# Normalize ``include_pos`` into a set of POS-tag strings (or empty/None for "no filter").
include_pos = utils.to_collection(include_pos, compat.unicode_, set)
# A float ``topn`` is interpreted as a fraction of terms, so it must lie in (0.0, 1.0].
if isinstance(topn, float):
if not 0.0 < topn <= 1.0:
raise ValueError(
"topn={} is invalid; "
"must be an int, or a float between 0.0 and 1.0".format(topn)
)
# bail out on empty docs
if not doc:
return []
# build up a graph of good words, edges weighting by adjacent sentence co-occurrence
cooc_mat = collections.Counter()
# Count sentences up front; ``min(2, n_sents)`` below shrinks the window
# so the sliding window stays valid when the doc has a single sentence.
n_sents = itertoolz.count(doc.sents) # in case doc only has 1 sentence
for sent1, sent2 in itertoolz.sliding_window(min(2, n_sents), doc.sents):
# Keep only content words: drop stopwords/punctuation/whitespace and,
# if a POS filter was given, anything outside it.
window_words = (
word
for word in itertoolz.concatv(sent1, sent2)
if not (word.is_stop or word.is_punct or word.is_space)
and (not include_pos or word.pos_ in include_pos)
)
window_words = ke_utils.normalize_terms(window_words, normalize)
# Count each unordered pair of distinct co-occurring terms once per window;
# sorting first makes (w1, w2) canonical so Counter keys don't duplicate.
cooc_mat.update(
w1_w2
for w1_w2 in itertools.combinations(sorted(window_words), 2)
if w1_w2[0] != w1_w2[1]
)
# doc doesn't have any valid words...
if not cooc_mat:
return []
# NOTE(review): fragment of a word-swap augmentation transform -- the enclosing
# signature (``aug_toks``, ``pos``, ``num``) is outside this chunk; confirm.
# The commented-out block below is an earlier all-pairs-per-POS approach,
# kept for reference; the live code restricts swaps to adjacent tokens.
# pos = set(aug_tok.pos for aug_tok in aug_toks if aug_tok.is_word)
# cand_idx_pairs = list(
# itertools.chain.from_iterable(
# itertools.combinations(
# (idx for idx, aug_tok in enumerate(aug_toks) if aug_tok.pos == pos_),
# 2,
# )
# for pos_ in pos
# )
# )
# Indices of word tokens that pass the (optional) POS filter.
cand_idxs = (
idx for idx, aug_tok in enumerate(aug_toks)
if aug_tok.is_word and (pos is None or aug_tok.pos in pos)
)
# Only directly adjacent candidate pairs are eligible to be swapped.
cand_idx_pairs = [
(idx1, idx2) for idx1, idx2 in itertoolz.sliding_window(2, cand_idxs)
if idx2 - idx1 == 1
]
rand_idx_pairs = _select_random_candidates(cand_idx_pairs, num)
# Nothing swappable: return an (unmodified) shallow copy of the input.
if not rand_idx_pairs:
return aug_toks[:]
new_aug_toks = aug_toks[:]
for idx1, idx2 in rand_idx_pairs:
tok1 = new_aug_toks[idx1]
tok2 = new_aug_toks[idx2]
# Place tok2's content at idx1 but keep tok1's trailing whitespace so
# the surface text stays well-formed after the swap.
new_aug_toks[idx1] = aug_utils.AugTok(
text=tok2.text,
ws=tok1.ws,
pos=tok2.pos,
is_word=tok2.is_word,
syns=tok2.syns,
# NOTE(review): this AugTok(...) call is cut off mid-constructor in this
# chunk (no closing paren, and the mirror assignment to idx2 is missing).
# NOTE(review): fragment of a synonym-insertion augmentation transform --
# the enclosing signature (``aug_toks``, ``pos``, ``num``) is outside this
# chunk, and the loop below is cut off at the end; confirm against the full file.
# bail out on very short sentences to avoid clobbering meaning
if len(aug_toks) < 3:
return aug_toks[:]
# Tokens that actually have synonyms (and pass the optional POS filter)
# are the only candidates worth inserting.
cand_aug_toks = [
aug_tok for aug_tok in aug_toks
if aug_tok.syns and (pos is None or aug_tok.pos in pos)
]
rand_aug_toks = _select_random_candidates(cand_aug_toks, num)
# Pick distinct insertion positions, one per selected token.
rand_idxs = random.sample(range(len(aug_toks)), len(rand_aug_toks))
if not rand_idxs:
return aug_toks[:]
rand_aug_toks = iter(rand_aug_toks)
new_aug_toks = []
# Walk (prev, curr) pairs; the leading None lets idx 0 have a "no previous token" case.
for idx, (prev_tok, curr_tok) in enumerate(itertoolz.sliding_window(2, [None] + aug_toks)):
if idx in rand_idxs:
rand_aug_tok = next(rand_aug_toks)
if prev_tok:
# use previous token's whitespace for inserted synonym
new_tok_ws = prev_tok.ws
if prev_tok.is_word and not prev_tok.ws:
# previous token should have whitespace, if a word
new_aug_toks[-1] = aug_utils.AugTok(
text=prev_tok.text,
ws=" ",
pos=prev_tok.pos,
is_word=True,
syns=prev_tok.syns,
)
else:
# No previous token (sentence start): default to a single space.
new_tok_ws = " "
def unzip(seq):
    """
    Split an iterable of n-tuples into an n-tuple of lazy iterables, one per
    position (a lazy "transpose").

    Borrowed from ``toolz.sandbox.core.unzip``, but using cytoolz instead of
    toolz to avoid the additional dependency.
    """
    it = iter(seq)
    # Peek at the first element to learn how many output iterators are needed;
    # an empty input produces an empty tuple.
    try:
        head = tuple(next(it))
    except StopIteration:
        return ()
    width = len(head)
    # Tee the re-assembled stream (first element pushed back on) once per
    # output position...
    streams = itertools.tee(itertoolz.cons(head, it), width)
    # ...and pluck the matching position out of each independent copy.
    return tuple(
        itertoolz.pluck(pos, stream) for pos, stream in enumerate(streams)
    )
# NOTE(review): exact duplicate of the ``unzip`` defined immediately above --
# at import time this second definition silently rebinds the module-level name
# to an identical function. Likely a copy/paste artifact; one copy should be removed.
def unzip(seq):
"""
Borrowed from ``toolz.sandbox.core.unzip``, but using cytoolz instead of toolz
to avoid the additional dependency.
"""
seq = iter(seq)
# check how many iterators we need
# (peek at the first tuple; an empty input short-circuits to an empty result)
try:
first = tuple(next(seq))
except StopIteration:
return tuple()
# and create them
# tee the stream (with the peeked element pushed back on) once per tuple
# position, then pluck that position from each independent copy
niters = len(first)
seqs = itertools.tee(itertoolz.cons(first, seq), niters)
return tuple(itertools.starmap(itertoolz.pluck, enumerate(seqs)))
normalize (str)
stop_words (Set[str])
window_size (int)
Returns:
Dict[int, Dict[str, list]]
"""
# NOTE(review): the ``def`` line and the head of the docstring above are
# outside this chunk; ``doc``, ``normalize``, ``stop_words``, and
# ``window_size`` are presumed parameters per the visible docstring tail.
# word id -> {"is_uc": [...], "sent_idx": [...], "l_context": [...], "r_context": [...]}
word_occ_vals = collections.defaultdict(lambda: collections.defaultdict(list))
def _is_upper_cased(tok):
# A token counts as upper-cased if it's ALL-CAPS, or Title-cased when it
# is *not* merely capitalized as the first word of its sentence.
return tok.is_upper or (tok.is_title and not tok.is_sent_start)
attr_name = _get_attr_name(normalize, False)
# Pad each sentence on both ends so every word gets a full-width context window.
padding = [None] * window_size
for sent_idx, sent in enumerate(doc.sents):
sent_padded = itertoolz.concatv(padding, sent, padding)
# Window layout: [window_size left words] [center word] [window_size right words].
for window in itertoolz.sliding_window(1 + (2 * window_size), sent_padded):
lwords, word, rwords = window[:window_size], window[window_size], window[window_size + 1:]
w_id = getattr(word, attr_name)
# NOTE: mutates the caller-supplied ``stop_words`` set in place.
if word.is_stop:
stop_words.add(w_id)
word_occ_vals[w_id]["is_uc"].append(_is_upper_cased(word))
word_occ_vals[w_id]["sent_idx"].append(sent_idx)
# Context lists exclude padding (None), punctuation, and whitespace tokens.
word_occ_vals[w_id]["l_context"].extend(
getattr(w, attr_name) for w in lwords
if not (w is None or w.is_punct or w.is_space)
)
word_occ_vals[w_id]["r_context"].extend(
getattr(w, attr_name) for w in rwords
if not (w is None or w.is_punct or w.is_space)
)
return word_occ_vals