# NOTE: the import paths below are reconstructed; they assume Miðeind's
# GreynirPackage ("reynir") and Tokenizer ("tokenizer") libraries, and may
# differ in the original module.
from typing import Dict, List, Tuple

import tokenizer
from tokenizer import Tok
from reynir import bintokenizer
from reynir.binparser import BIN_Token
from reynir.settings import VerbObjects  # location varies between Reynir versions

@classmethod
def _normalize_text(cls, text):
    """ Preprocess text and normalize it for the parsing network.
        (Class method excerpted here without its enclosing class.) """
    pgs = text.split("\n")
    normalized_pgs = [
        [
            tok.txt
            for tok in bintokenizer.tokenize(pg)
            if BIN_Token.is_understood(tok)
        ]
        for pg in pgs
    ]
    return [" ".join(tok for tok in npg if tok) for npg in normalized_pgs]
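
# Usage sketch (added; not part of the original source). Assuming the reynir
# and tokenizer imports above, this replays the same normalization outside
# the class: tokenize each paragraph, keep only tokens the parser understands,
# and join their texts back into a string.
sample = "Hún settist niður.\nSvo stóð hún upp."
for _pg in sample.split("\n"):
    _toks = [t.txt for t in bintokenizer.tokenize(_pg) if BIN_Token.is_understood(t)]
    print(" ".join(t for t in _toks if t))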
def split_text(text: str) -> List[List[str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns a list of sentence lists, one list per paragraph """
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)
    pgs = tokenizer.paragraphs(tok_stream)
    data = []  # type: List[List[str]]
    for pg in pgs:
        pg_data = []  # type: List[str]
        for _, sentence in pg:
            sentence = list(filter(BIN_Token.is_understood, sentence))
            sentence_text = tokenizer.normalized_text_from_tokens(sentence)
            pg_data.append(sentence_text)
        data.append(pg_data)
    return data
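
# Usage sketch (added): splitting a short two-paragraph text.
# prep_text_for_tokenizer() is defined elsewhere in the original module; it is
# assumed here to map plain newlines to the paragraph markers that
# tokenizer.paragraphs() recognizes.
pgs_demo = split_text("Hún settist niður. Svo stóð hún upp.\nNýr kafli hófst.")
for i, pg in enumerate(pgs_demo):
    print(i, pg)  # one list of normalized sentence strings per paragraph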
@classmethod
def _normalize_sentence(cls, single_sentence):
    """ Preprocess a single sentence and normalize it for the parsing network """
    return [
        tok.txt
        for tok in bintokenizer.tokenize(single_sentence)
        if BIN_Token.is_understood(tok)
    ]
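
# Usage sketch (added): the single-sentence variant yields a flat list of
# token texts rather than one string per sentence.
norm_demo = [
    t.txt
    for t in bintokenizer.tokenize("Hún settist niður.")
    if BIN_Token.is_understood(t)
]
print(norm_demo)  # token texts, with un-understood tokens filtered out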
# Check case match (the two enclosing guards below are reconstructed; the
# snippet began mid-function, and the elif implies both of them)
if self.case is not None:
    if case_override is not None:
        # Case override: don't accept any case besides the override
        for c in self._CASES:
            if c != case_override:
                if c.upper() in m.beyging:
                    return False
    elif self.case.upper() not in m.beyging:
        return False
# Check number match
if self.number is not None:
    if self.number.upper() not in m.beyging:
        return False
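# Illustration (added): BÍN inflection descriptors ("beyging") are compact
# feature strings, so the case and number checks above are plain substring
# tests. For the noun form "hestinum" (dative singular definite of "hestur")
# the descriptor is "ÞGFETgr", so:
#     "ÞGF" in "ÞGFETgr"  -> True   (a dative terminal matches)
#     "ET" in "ÞGFETgr"   -> True   (a singular terminal matches)
#     "NF" in "ÞGFETgr"   -> False  (a nominative terminal is rejected)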
if self.is_verb:
    # The following code is parallel to BIN_Token.verb_matches()
    for v in self.varlist:
        # Look up the variant to see if it is one of the required ones for verbs
        rq = BIN_Token._VERB_FORMS.get(v)
        if rq and rq not in m.beyging:
            # This is a required variant that is not found in the form we have:
            # no match
            return False
    # Reject meanings carrying these variants unless the terminal asks for them
    for v in ["sagnb", "lhþt", "bh"]:
        if BIN_Token.VARIANT[v] in m.beyging and v not in self.variants:
            return False
    if "bh" in self.variants and "ST" in m.beyging:
        return False
    if self.varlist[0] not in "012":
        # No need for an argument check: we're done, unless...
        if "lhþt" in self.variants:
            # Special check for lhþt: it may specify a case without that
            # being an argument case
            if any(
                c in self.variants and BIN_Token.VARIANT[c] not in m.beyging
                for c in BIN_Token.CASES
            ):
                # Terminal specified a non-argument case but the token
                # doesn't have it: no match
                return False
        return True
    nargs = int(self.varlist[0])
    if m.stofn in VerbObjects.VERBS[nargs]:
        if nargs == 0 or len(self.varlist) < 2:
            # No arguments: we're done
            return True
        for argspec in VerbObjects.VERBS[nargs][m.stofn]:
            if all(self.varlist[1 + ix] == c for ix, c in enumerate(argspec)):
                # This verb takes arguments that match the terminal
                return True
        return False
    # The verb was not found with this exact argument count. The snippet is
    # truncated here; parallel to BIN_Token.verb_matches(), the ending
    # presumably accepts the verb if it is known with fewer arguments:
    for i in range(0, nargs):
        if m.stofn in VerbObjects.VERBS[i]:
            return True
    return False
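
# Sketch (added, with made-up table entries): the shape of VerbObjects.VERBS
# assumed by the argument check above. It is indexed by argument count, and
# each entry maps a verb stem to the case lists its arguments may take.
_VERBS_DEMO = [
    {},                         # verbs with 0 arguments
    {"sakna": [["ef"]]},        # "sakna" (to miss) takes one genitive object
    {"gefa": [["þgf", "þf"]]},  # "gefa" (to give) takes dative + accusative
]
_varlist = ["2", "þgf", "þf"]   # terminal for a verb with two args: dat + acc
_nargs = int(_varlist[0])
assert any(
    all(_varlist[1 + ix] == c for ix, c in enumerate(argspec))
    for argspec in _VERBS_DEMO[_nargs].get("gefa", [])
)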
# NOTE: this function's header was lost in extraction; the name below is a
# placeholder and the signature is inferred from the body and return statement.
def split_text_with_indices(text: str) -> Tuple[Dict[int, List[int]], Dict[int, str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns:
            a dictionary mapping paragraph indices to constituent sentence indices
            a dictionary mapping sentence indices to sentences
    """
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)
    pgs = tokenizer.paragraphs(tok_stream)
    pg_idx_to_sent_idx = dict()  # type: Dict[int, List[int]]
    sent_idx_to_sent = dict()  # type: Dict[int, str]
    curr_sent_idx = 0
    curr_pg_idx = 0
    for pg in pgs:
        sent_idxs = []  # type: List[int]
        for _, sent in pg:
            curr_sent = list(filter(BIN_Token.is_understood, sent))  # type: List[Tok]
            curr_sent_text = tokenizer.normalized_text_from_tokens(curr_sent)
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent_text
            curr_sent_idx += 1
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1
    return pg_idx_to_sent_idx, sent_idx_to_sent
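
# Usage sketch (added; the function name above is a reconstruction): mapping a
# short text to its paragraph and sentence indices.
pg_map, sent_map = split_text_with_indices("Hún settist niður. Svo stóð hún upp.")
for pg_idx, sent_idxs in pg_map.items():
    print(pg_idx, [sent_map[i] for i in sent_idxs])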