"""
Args:
dirpath (str or :class:`pathlib.Path`)
langs (Set[str])
min_len (int)
Returns:
List[Tuple[str, str]]
"""
ds = textacy.datasets.UDHR(data_dir=dirpath)
data = [
(snippet, meta["lang"])
for text, meta in ds.records()
for snippet in text.split("\n")
if meta["lang"] in langs
and itertoolz.count(char for char in snippet if char.isalnum()) >= min_len
]
return data
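A quick usage sketch for the loader above. The function name, directory, languages, and minimum length are all illustrative (the snippet does not show its real signature), and the UDHR dataset must already be available under the given directory:

data = load_udhr_data(
    "data/udhr",                     # where textacy.datasets.UDHR keeps its files
    langs={"en", "es", "de", "fr"},  # language codes as they appear in meta["lang"]
    min_len=25,                      # drop snippets with fewer than 25 alphanumeric chars
)
# each item is a (text snippet, language code) pair
print(len(data), data[0][1])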
def _add_valid_doc(self, doc):
    # append the doc, remember its id, and update the corpus' running totals
    self.docs.append(doc)
    self._doc_ids.append(id(doc))
    self.n_docs += 1
    self.n_tokens += len(doc)
    if doc.is_sentenced:
        # itertoolz.count tallies doc.sents (a generator) without materializing it
        self.n_sents += itertoolz.count(doc.sents)
def __init__(self, doc):
    self.lang = doc.vocab.lang
    self.n_sents = itertoolz.count(doc.sents) if doc.is_sentenced else None
    # get objs for basic count computations
    hyphenator = cache.load_hyphenator(lang=self.lang)
    words = tuple(
        extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False)
    )
    syllables_per_word = tuple(
        len(hyphenator.positions(word.lower_)) + 1 for word in words
    )
    chars_per_word = tuple(len(word) for word in words)
    # compute basic counts needed for most readability stats
    self.n_words = len(words)
    self.n_unique_words = len({word.lower for word in words})
    self.n_chars = sum(chars_per_word)
    # "long" words are those with 7+ characters (the convention used by LIX / RIX)
    self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
    self.n_syllables = sum(syllables_per_word)
    self.n_monosyllable_words = sum(1 for spw in syllables_per_word if spw == 1)
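The counts gathered above are the inputs to standard readability statistics. As an illustration (the textbook formula, not necessarily this class's own method), the Flesch-Kincaid grade level can be computed directly from three of them:

def flesch_kincaid_grade_level(n_syllables, n_words, n_sents):
    # classic Flesch-Kincaid formula: higher values mean harder-to-read text
    return 11.8 * (n_syllables / n_words) + 0.39 * (n_words / n_sents) - 15.59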
def _remove_one_doc_by_index(self, idx):
    doc = self.docs[idx]
    self.n_docs -= 1
    self.n_tokens -= len(doc)
    if doc.is_sentenced:
        self.n_sents -= itertoolz.count(doc.sents)
    del self.docs[idx]
    del self._doc_ids[idx]
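Helpers like _add_valid_doc and _remove_one_doc_by_index above are internal bookkeeping for a corpus object; a rough sketch of the public side with textacy's Corpus (the model name and texts are illustrative, and the spaCy model must be installed):

import textacy

corpus = textacy.Corpus("en_core_web_sm", data=["A first tiny document.", "And a second one."])
# the internal helpers keep these running totals in sync as docs are added or removed
print(corpus.n_docs, corpus.n_sents, corpus.n_tokens)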
# validate / transform args
include_pos = utils.to_collection(include_pos, compat.unicode_, set)
if isinstance(topn, float):
    if not 0.0 < topn <= 1.0:
        raise ValueError(
            "topn={} is invalid; "
            "must be an int, or a float between 0.0 and 1.0".format(topn)
        )

# bail out on empty docs
if not doc:
    return []

# build up a graph of good words, with edges weighted by adjacent-sentence co-occurrence
cooc_mat = collections.Counter()
n_sents = itertoolz.count(doc.sents)  # needed to handle docs with only 1 sentence
for window_sents in itertoolz.sliding_window(min(2, n_sents), doc.sents):
    # each window holds two adjacent sentences, or a single sentence when the doc has only one
    window_words = (
        word
        for word in itertoolz.concatv(*window_sents)
        if not (word.is_stop or word.is_punct or word.is_space)
        and (not include_pos or word.pos_ in include_pos)
    )
    window_words = ke_utils.normalize_terms(window_words, normalize)
    cooc_mat.update(
        w1_w2
        for w1_w2 in itertools.combinations(sorted(window_words), 2)
        if w1_w2[0] != w1_w2[1]
    )

# doc doesn't have any valid words...
if not cooc_mat:
    return []
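From here, the co-occurrence counts are typically turned into a weighted word graph and ranked. The original function uses its own graph utilities; this is only a generic sketch of the idea using networkx:

import networkx as nx

graph = nx.Graph()
graph.add_weighted_edges_from(
    (w1, w2, count) for (w1, w2), count in cooc_mat.items()
)
# PageRank over the weighted co-occurrence graph gives per-word importance scores
word_ranks = nx.pagerank(graph, alpha=0.85, weight="weight")
top_words = sorted(word_ranks, key=word_ranks.get, reverse=True)[:10]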
# lwords / rwords hold the words in a window to the left / right of the current word;
# None padding and punctuation / whitespace tokens are skipped
word_vals[wid]["left_context"].extend(
    w.lower for w in lwords
    if not (w is None or w.is_punct or w.is_space)
)
word_vals[wid]["right_context"].extend(
    w.lower for w in rwords
    if not (w is None or w.is_punct or w.is_space)
)
# compute word frequencies and aggregated statistics
word_freqs = {wid: len(vals["is_upper_cased"]) for wid, vals in word_vals.items()}
freqs_nsw = [freq for wid, freq in word_freqs.items() if wid not in stop_words]
freqs_max = max(word_freqs.values())
freq_mean = compat.mean_(freqs_nsw)
freq_stdev = compat.stdev_(freqs_nsw)
# compute per-word weights
word_weights = collections.defaultdict(dict)
n_sents = itertoolz.count(doc.sents)
for wid, vals in word_vals.items():
    freq = word_freqs[wid]
    # casing: how often the word appears upper-cased, damped by its frequency
    word_weights[wid]["case"] = sum(vals["is_upper_cased"]) / math.log2(1 + freq)
    # position: terms whose occurrences cluster in earlier sentences get smaller weights
    word_weights[wid]["pos"] = math.log2(
        math.log2(3 + compat.median_(vals["sent_idx"]))
    )
    # frequency, normalized by the mean and stdev of non-stopword frequencies
    word_weights[wid]["freq"] = freq / (freq_mean + freq_stdev)
    # context diversity: share of unique words seen to the left / right of this word
    n_unique_lc = len(set(vals["left_context"]))
    n_unique_rc = len(set(vals["right_context"]))
    try:
        wl = n_unique_lc / len(vals["left_context"])
    except ZeroDivisionError:
        wl = 0.0
    try:
        wr = n_unique_rc / len(vals["right_context"])
    except ZeroDivisionError:
        wr = 0.0
    pl = n_unique_lc / freqs_max
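The snippet breaks off mid-loop. For reference, YAKE (the scheme these weights follow) folds the per-term weights into a single score in which lower values mark more important terms; a sketch of the paper's combination formula with generic names, not the snippet's exact continuation:

def yake_term_score(case, pos, freq, rel, sent):
    # rel is the relatedness-to-context term built from wl / wr (and pl / pr);
    # sent is the fraction of sentences the term appears in
    return (rel * pos) / (case + (freq / rel) + (sent / rel))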
"""
dirpath = textacy.utils.to_path(dirpath).resolve()
raw_tweets = textacy.io.read_json(
    dirpath.joinpath("tweets.jsonl"), mode="rt", lines=True)
tweets = []
for tweet in raw_tweets:
    # strip any URLs out of the tweet text entirely
    for url in tweet.get("urls", []):
        for item in url.values():
            tweet["text"] = tweet["text"].replace(item, "")
    tweets.append(tweet)
ds = [
    (tweet["text"], tweet["lang"])
    for tweet in tweets
    if tweet["lang"] in langs
    and itertoolz.count(char for char in tweet["text"] if char.isalnum()) >= min_len
]
return ds
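All of these snippets rely on the same helper, cytoolz's itertoolz.count, which tallies the items of any iterable, including one-shot generators, without building a list. A minimal equivalence check:

from cytoolz import itertoolz

assert itertoolz.count(char for char in "ab1!?" if char.isalnum()) == 3
# roughly the same as the pure-Python idiom:
assert sum(1 for char in "ab1!?" if char.isalnum()) == 3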