from collections import OrderedDict

# tokenize, Fast_Parser, IncrementalParser, TreeUtility and ParseForestDumper
# are assumed to be imported from the enclosing Greynir codebase

# Sample Icelandic text to tokenize and parse
# (e.g. "Geysir er gamall goshver." = "Geysir is an old geyser.")
text = """
Í miðbæ Reykjavíkur er herrafataverslunin Geysir.
Mér er sagt að Geysir sé hættur að gjósa.
Geysir er hættur að gjósa.
Geysir er gamall goshver.
Fyrirtækið Apple-búðin selur Apple Mac tölvur.
Fyrirtækið Origo selur IBM tölvur.
Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
"""
toklist = tokenize(text)
fp = Fast_Parser(verbose=False)
ip = IncrementalParser(fp, toklist, verbose=False)
# Dict of parse trees in string dump format,
# stored by sentence index (1-based)
trees = OrderedDict()
num_sent = 0
for p in ip.paragraphs():
    for sent in p.sentences():
        num_sent += 1
        num_tokens = len(sent)
        assert sent.parse(), "Sentence does not parse: " + sent.text
        # Obtain a text representation of the parse tree
        token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
        # Create a verbose text representation of
        # the highest scoring parse tree
        tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
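
For reference, here is a minimal sketch of what the raw token stream produced by tokenize() looks like, using the standalone tokenizer package that the code above builds on; the sample sentence is taken from the test text:

from tokenizer import tokenize, TOK

for tok in tokenize("Geysir er gamall goshver."):
    # Each token carries a kind, a text field and an optional value;
    # TOK.descr maps the numeric kind to a human-readable name
    print("{0:12} {1}".format(TOK.descr[tok.kind], tok.txt or ""))

Besides the word and punctuation tokens themselves, the stream brackets each sentence with S_BEGIN and S_END marker tokens, which is what allows the parser to split the input into sentences.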
# Fetcher, tokenize and recognize_entities are assumed to be imported
# from the enclosing Greynir codebase
def to_tokens(soup, enclosing_session=None):
    """ Convert an HTML soup root into a parsable token stream """
    # Extract the text content of the HTML into a list
    tlist = Fetcher.TextList()
    Fetcher.extract_text(soup, tlist)
    text = tlist.result()
    # Tokenize the resulting text, returning a generator
    token_stream = tokenize(text)
    return recognize_entities(token_stream, enclosing_session=enclosing_session)
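
A hypothetical usage sketch for to_tokens(), assuming requests and BeautifulSoup for fetching and parsing the page; the URL is illustrative only:

import requests
from bs4 import BeautifulSoup

html = requests.get("https://example.com/").text  # illustrative URL
soup = BeautifulSoup(html, "html.parser")
for tok in to_tokens(soup):
    if tok.txt:  # skip marker tokens that carry no text
        print(tok.txt)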
import time

# mark_paragraphs, tokenize, recognize_entities and TreeUtility are assumed
# to be imported from the enclosing Greynir codebase
def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of
        a transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register. """
    t0 = time.time()
    # Demarcate paragraphs in the input
    text = mark_paragraphs(text)
    # Tokenize the result
    token_stream = tokenize(text)
    toklist = list(recognize_entities(token_stream, enclosing_session=session))
    t1 = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    if all_names is None:
        register = None
    else:
        from queries.builtin import create_name_register
        register = create_name_register(toklist, session, all_names=all_names)
    t2 = time.time()
    stats["tok_time"] = t1 - t0
    stats["parse_time"] = t2 - t1
    stats["total_time"] = t2 - t0
    return (pgs, stats, register)
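
A hypothetical caller sketch for _process_text(). The parser and session objects are assumed to come from the surrounding infrastructure, and the xform signature shown here (tokens, tree, error index) is an assumption based on how _process_toklist appears to use it:

def xform(tokens, tree, err_index):
    # Hypothetical pass-through transformation: return the tokens unchanged
    return tokens

pgs, stats, register = _process_text(
    parser, session, "Geysir er gamall goshver.", None, xform
)
assert register is None  # all_names=None, so no name register is built
print("tok: {0:.3f} s, parse: {1:.3f} s, total: {2:.3f} s".format(
    stats["tok_time"], stats["parse_time"], stats["total_time"]))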
def parse(self, result):
    """ Parse the query from its string, returning True if valid """
    self._tree = None  # Erase previous tree, if any
    self._error = None  # Erase previous error, if any
    self._qtype = None  # Erase previous query type, if any
    self._key = None
    self._toklist = None
    q = self._query.strip()
    if not q:
        self.set_error("E_EMPTY_QUERY")
        return False
    toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
    toklist = list(toklist)
    # The following seems not to be needed and may complicate things
    # toklist = list(recognize_entities(toklist, enclosing_session=self._session))
    actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
    if actual_q:
        actual_q = actual_q[0].upper() + actual_q[1:]
        if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
            actual_q += "?"
    # Update the beautified query string, as the actual_q string
    # probably has more correct capitalization
    self.set_beautified_query(actual_q)
    if Settings.DEBUG:
        # Log the query string as seen by the parser
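
The correct_spaces() helper used above comes from the tokenizer package; here is a minimal sketch of its effect on a token-joined string (the expected output is an assumption about this particular input):

from tokenizer import correct_spaces

q = " ".join(["hvað", "er", "Geysir", "?"])
print(correct_spaces(q))  # expected output: "hvað er Geysir?"

This is why the code above re-joins the token texts and then runs correct_spaces() over the result: it reconstructs normal spacing around punctuation that tokenization has split apart.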
def tag(self, toklist_or_text):
    """ Assign IFD tags to the given toklist, putting the tag in the
        "i" field of each non-punctuation token. If a string is passed,
        tokenize it first. Return the toklist so modified. """
    if isinstance(toklist_or_text, str):
        toklist = list(tokenize(toklist_or_text))
    else:
        toklist = list(toklist_or_text)
    tagsets = []
    for t in toklist:
        if not t.txt:
            continue
        taglist = self.tag_single_token(t)
        if taglist:
            # display = " | ".join("{0} {1:.2f}".format(w, p) for w, p in taglist)
            # print("{0:20}: {1}".format(t.txt, display))
            tagsets.append(taglist)
    _, tags = self._most_likely(tagsets)
    if not tags: