line_count += 1
if line_count % 10000 == 0:
    endtime = datetime.datetime.now()
    print("has processed: %d lines, takes %d seconds..." % (line_count, (endtime - starttime).seconds))
# split the line into paragraphs after removing references, the head entity and hrefs
paras = re.split(para_p, re.sub(trim_href_p, "", line.lower()))
for para in paras:
    sent_pos = 0
    words_set = []
    entity_index = []
    ent_dic = {}
    # skip empty (None) paragraphs and paragraphs of 30 characters or fewer
    if not para or len(para) <= 30:
        continue
    # iterate over all the anchors in the wiki text
    for anchor in re.finditer(anchor_p, para):
        # segment the plain text between the previous anchor and this one
        segment(para[sent_pos:anchor.start()], words_set)
        anchor_word = toWord(anchor.group(), ent_dic)
        if anchor_word:
            entity_index.append(len(words_set))
            words_set.append(anchor_word)
        sent_pos = anchor.end()
    # segment whatever follows the last anchor
    if sent_pos < len(para):
        segment(para[sent_pos:len(para)], words_set)
    if len(words_set) > 8:
        texts.append(" ".join(words_set) + "\n")
        if len(texts) >= 10000:
            fout_text.writelines(texts)
            del texts[:]
        for i in entity_index:
            # record the left context (reversed) and the right context around each entity
            anchors.append(ent_dic[words_set[i]] + "\t\t" + ";".join(reversed(words_set[max(0, i - ent_half_window - 1):i])) + "\n")
            anchors.append(ent_dic[words_set[i]] + "\t\t" + ";".join(words_set[i + 1:min(len(words_set), i + 1 + ent_half_window)]) + "\n")
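# The snippet above uses a common finditer idiom: walk the matches of an anchor
# regex and treat the text between consecutive matches as plain text. A minimal,
# self-contained sketch of that idiom (the names and the anchor pattern here are
# illustrative, not taken from the original script):
import re

def split_around_anchors(text, anchor_pattern=r"\[\[.*?\]\]"):
    """Yield ('text', piece) and ('anchor', piece) items in document order."""
    pos = 0
    for m in re.finditer(anchor_pattern, text):
        if pos < m.start():
            yield ("text", text[pos:m.start()])
        yield ("anchor", m.group())
        pos = m.end()
    if pos < len(text):
        yield ("text", text[pos:])

# list(split_around_anchors("see [[Python (language)|Python]] for details"))
# -> [('text', 'see '), ('anchor', '[[Python (language)|Python]]'), ('text', ' for details')]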
def _tokenize(result: REPPResult, pattern: str) -> List[Tuple[int, int, str]]:
    s, sm, em = result  # unpack for efficiency in loop
    toks = []
    pos = 0
    for m in re.finditer(pattern, result.string):
        if pos < m.start():
            toks.append((pos + sm[pos + 1],
                         m.start() + em[m.start()],
                         s[pos:m.start()]))
        pos = m.end()
    if pos < len(s):
        toks.append((pos + sm[pos + 1],
                     len(s) + em[len(s)],
                     s[pos:]))
    return toks
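# _tokenize splits on a delimiter pattern while mapping offsets through the REPP
# start/end maps (sm/em). Ignoring those maps, the underlying finditer idiom is
# just "emit the spans between delimiter matches" (a sketch, not the PyDelphin API):
import re
from typing import List, Tuple

def spans_between(s: str, delimiter: str = r"\s+") -> List[Tuple[int, int, str]]:
    toks, pos = [], 0
    for m in re.finditer(delimiter, s):
        if pos < m.start():
            toks.append((pos, m.start(), s[pos:m.start()]))
        pos = m.end()
    if pos < len(s):
        toks.append((pos, len(s), s[pos:]))
    return toks

# spans_between("a  bc d") -> [(0, 1, 'a'), (3, 5, 'bc'), (6, 7, 'd')]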
def chunk_doc(content: str) -> List[str]:
    """
    Split a document into a list of chunks. Chunks at odd indices contain
    the numbered text.
    :param content: The content of the document as a string.
    """
    starts = regex.finditer(r"\\beginnumbering\n", content)
    ends = regex.finditer(r"\n\\endnumbering", content)
    if "\\beginnumbering\n" in content:
        indices = []
        for start, end in zip(starts, ends):
            if not indices:
                # Set up the indices with the slice before the first numbered text
                indices.append([0, start.span()[0]])
            else:
                # Add the indices between the previous numbered section and the next
                indices.append([indices[-1][-1], start.span()[0]])
            # Now, add the indices of the numbered section itself
            indices.append([start.span()[0], end.span()[1]])
        # Add the tail from the last numbered section to the end
        try:
            indices.append([indices[-1][-1], len(content) + 1])
        except IndexError:
            raise ValueError(
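# chunk_doc pairs the spans of two marker regexes by zipping their finditer
# iterators. A minimal sketch of that pairing with generic markers (not the
# original \beginnumbering/\endnumbering handling):
import re

def paired_spans(text, open_pat=r"<open>", close_pat=r"</open>"):
    """Return (start, end) spans covering each open...close region, in order."""
    starts = re.finditer(open_pat, text)
    ends = re.finditer(close_pat, text)
    return [(s.start(), e.end()) for s, e in zip(starts, ends)]

# paired_spans("a <open>one</open> b <open>two</open>") -> [(2, 18), (21, 37)]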
def try_merge_modifier_token(self, extract_result: ExtractResult, pattern: Pattern, source: str,
                             potential_ambiguity: bool = False) -> bool:
    before_str = source[0:extract_result.start]
    after_str = source[extract_result.start: extract_result.length]
    # Avoid adding a mod in ambiguous cases, e.g. the "from" in "from ... to ..." should not add a mod
    if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
            regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
        matches = list(regex.finditer(self.config.potential_ambiguous_range_regex, source))
        if matches and len(matches):
            return any(match.start() < extract_result.start + extract_result.length and
                       match.end() > extract_result.start for match in matches)
            # return self._filter_item(extract_result, matches)
    token = self.has_token_index(before_str.strip(), pattern)
    if token.matched:
        mod_len = len(before_str) - token.index
        extract_result.length += mod_len
        extract_result.start -= mod_len
        extract_result.text = source[extract_result.start:extract_result.start + extract_result.length]
        extract_result.meta_data = self.assign_mod_metadata(extract_result.meta_data)
        return True
    elif self.config.check_both_before_after:
        # also check after_str
        after_str = source[extract_result.start: extract_result.length]
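# The ambiguity guard above boils down to "does any match of the ambiguity regex
# overlap the extracted span?". A self-contained sketch of that overlap test
# (plain re and a bare (start, length) span instead of the recognizers-text
# ExtractResult type):
import re

def overlaps_any_match(pattern, text, start, length):
    """True if any match of `pattern` in `text` overlaps the span [start, start + length)."""
    return any(m.start() < start + length and m.end() > start
               for m in re.finditer(pattern, text))

# overlaps_any_match(r"from .+? to .+?\b", "flights from monday to friday", 13, 6)  # span "monday" -> True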
def basic_regex_match(self, source: str) -> List[Token]:
    tokens: List[Token] = list()
    # handle "now"
    matches: List[Match] = list(
        regex.finditer(self.config.now_regex, source))
    tokens.extend(map(lambda x: Token(x.start(), x.end()), matches))
    return tokens
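# basic_regex_match simply turns every finditer match into a (start, end) token.
# The same idea with plain re and bare tuples (a sketch, not the recognizers-text
# Token type):
import re

def match_spans(pattern, source):
    return [(m.start(), m.end()) for m in re.finditer(pattern, source)]

# match_spans(r"\bnow\b", "do it now, right now") -> [(6, 9), (17, 20)]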
def finditer(pattern, string, *args, **kwargs):
    """Wrapper for `finditer`."""
    flags = args[2] if len(args) > 2 else kwargs.get('flags', 0)
    return _regex.finditer(_apply_search_backrefs(pattern, flags), string, *args, **kwargs)
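# The wrapper above rewrites the pattern (expanding the library's extra
# back-reference syntax) before handing it to the real finditer. The general
# shape of such a wrapper, with a placeholder preprocessing step that is not
# the backrefs internals:
import re

def _expand_pattern(pattern, flags):
    # placeholder for _apply_search_backrefs: return the pattern unchanged
    return pattern

def finditer_wrapped(pattern, string, flags=0):
    """Preprocess the pattern, then delegate to re.finditer."""
    return re.finditer(_expand_pattern(pattern, flags), string, flags)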
matches_list = list(
    map(lambda x: MatchesVal(matches=list(re.finditer(x.re, source)), val=x.val), self.regexes))
matches_list = list(filter(lambda x: len(x.matches) > 0, matches_list))
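# The fragment above collects the finditer matches for every (regex, value) pair
# and then drops the pairs that matched nothing. A self-contained sketch, where
# MatchesVal is a stand-in namedtuple rather than the recognizers-text class:
import re
from collections import namedtuple

MatchesVal = namedtuple("MatchesVal", ["matches", "val"])

def collect_matches(regex_value_pairs, source):
    matches_list = [MatchesVal(matches=list(re.finditer(p, source)), val=v)
                    for p, v in regex_value_pairs]
    return [mv for mv in matches_list if mv.matches]

# collect_matches([(r"\d+", "number"), (r"[A-Z]{3}", "code")], "order 42")
# -> keeps only the 'number' entry; the 'code' pattern found no matches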
def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    for match in re.finditer(r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tag if useregex else re.escape(tag)), s):
        yield (match.span("__open"), match.span("__content"), match.span("__close"))
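# The function above relies on a back-reference (\1) to the opening group so
# that the same literal closes the pair. A direct illustration of that regex
# with plain re, locating $...$ spans in a LaTeX-ish string:
import re

pattern = r"(?P<open>\$)(?P<content>.*?)(?P<close>\1)"
spans = [(m.span("open"), m.span("content"), m.span("close"))
         for m in re.finditer(pattern, "x $a+b$ y $c$")]
# spans == [((2, 3), (3, 6), (6, 7)), ((10, 11), (11, 12), (12, 13))]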
def _filter_ambiguity(self, extract_results: List[ExtractResult], text: str) -> List[ExtractResult]:
    if self.config.ambiguity_filters_dict is not None:
        for regex_var in self.config.ambiguity_filters_dict:
            regex_var_value = self.config.ambiguity_filters_dict[regex_var]
            try:
                # keep only the non-empty matches of the ambiguity-filter regex
                reg_len = list(filter(lambda x: x.group(), regex.finditer(regex_var_value, text)))
                reg_length = len(reg_len)
                if reg_length > 0:
                    matches = reg_len
                    # extraction results whose span overlaps any of the matches
                    new_ers = list(filter(lambda x: list(
                        filter(lambda m: m.start() < x.start + x.length and m.start() +
                               len(m.group()) > x.start, matches)), extract_results))
                    if len(new_ers) > 0:
                        for item in extract_results:
                            for i in new_ers:
                                if item is i:
                                    extract_results.remove(item)
            except Exception:
                pass
    return extract_results
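# _filter_ambiguity drops every extraction whose span overlaps a match of an
# "ambiguity filter" regex. The same filtering reduced to (start, length) tuples
# and plain re (a sketch, not the recognizers-text types):
import re

def drop_overlapping(spans, filter_pattern, text):
    """Remove (start, length) spans that overlap any match of filter_pattern in text."""
    matches = [m for m in re.finditer(filter_pattern, text) if m.group()]
    return [(s, l) for s, l in spans
            if not any(m.start() < s + l and m.end() > s for m in matches)]

# drop_overlapping([(0, 3), (11, 4)], r"May\b", "May I have 2021 apples")  # -> [(11, 4)]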
def _split_emojis(self, node, token_class="emoticon"):
    boundaries = []
    # \X matches a full extended grapheme cluster (requires the `regex` module)
    for m in re.finditer(r"\X", node.value.text):
        if m.end() - m.start() > 1:
            # multi-codepoint cluster: treat it as an emoji if it contains a
            # pictographic codepoint, an emoji-presentation codepoint, or a
            # variation selector
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                boundaries.append((m.start(), m.end(), None))
        else:
            # single codepoint: only split it off if it is itself pictographic
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                boundaries.append((m.start(), m.end(), None))
    self._split_on_boundaries(node, boundaries, token_class)
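# A minimal demonstration of the grapheme-cluster scan used above, assuming a
# regex-module version that supports \X and the emoji Unicode properties:
import regex

def emoji_spans(text):
    spans = []
    for m in regex.finditer(r"\X", text):
        if regex.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
            spans.append((m.start(), m.end()))
    return spans

# emoji_spans("hi 🙂 and 👍🏽") -> [(3, 4), (9, 11)], provided the regex version
# treats the emoji-plus-skin-tone sequence as a single grapheme cluster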