Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Í miðbæ Reykjavíkur er herrafataverslunin Geysir.
Mér er sagt að Geysir sé hættur að gjósa.
Geysir er hættur að gjósa.
Geysir er gamall goshver.
Fyrirtækið Apple-búðin selur Apple Mac tölvur.
Fyrirtækið Origo selur IBM tölvur.
Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
"""
toklist = tokenize(text)
fp = Fast_Parser(verbose=False)
ip = IncrementalParser(fp, toklist, verbose=False)
# Dict of parse trees in string dump format,
# stored by sentence index (1-based)
trees = OrderedDict()
num_sent = 0
for p in ip.paragraphs():
    for sent in p.sentences():
        num_sent += 1
        num_tokens = len(sent)
        # Fail loudly if a test sentence does not parse; include the
        # offending sentence text in the assertion message
        assert sent.parse(), "Sentence does not parse: " + sent.text
        # Obtain a text representation of the parse tree
        token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
        # Create a verbose text representation of
        # the highest scoring parse tree
        tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
        # Add information about the sentence tree's score
def raw_tag_toklist(session, toklist, root=None):
    """ Tag an already-tokenized text. Returns the parsed paragraphs as
        lists of sentences, each sentence being a list of POS-tagged,
        normalized tokens. Unlike tag_toklist(), no name register is
        produced. The optional root overrides the grammar root nonterminal. """

    def to_tagged_tokens(tokens, tree, err_index):
        """ Per-sentence transform: dump the tokens of one sentence,
            POS-tagged and normalized, carrying any parse-error index """
        return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

    # verbose=False: suppress parser diagnostics
    parser = Fast_Parser(verbose=False, root=root)
    with parser:
        return TreeUtility._process_toklist(parser, session, toklist, to_tagged_tokens)
def tag_toklist(session, toklist, all_names=False):
    """ Tag an already-tokenized text. Returns a (pgs, stats, register)
        tuple: the parsed paragraphs as lists of sentences of POS-tagged
        tokens, parse statistics, and a name register built from the
        token list. """

    def to_tagged_tokens(tokens, tree, err_index):
        """ Per-sentence transform: dump the tokens of one sentence,
            POS-tagged and normalized, carrying any parse-error index """
        return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

    # Don't emit diagnostic messages while parsing
    with Fast_Parser(verbose=False) as parser:
        pgs, stats = TreeUtility._process_toklist(parser, session, toklist, to_tagged_tokens)
    # Imported locally — presumably to avoid a circular import at module
    # load time (NOTE(review): confirm against queries.builtin)
    from queries.builtin import create_name_register

    register = create_name_register(toklist, session, all_names=all_names)
    return pgs, stats, register
# Holds the full parse tree of the first successfully parsed sentence;
# written by xform() below through the nonlocal declaration
full_tree = None

def xform(tokens, tree, err_index):
    """ Transformation function that yields a simplified parse tree
        with POS-tagged, normalized terminal leaves for the sentence """
    if err_index is not None:
        # Parse error: return a plain token dump carrying the error index
        return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)
    # Successfully parsed: return a simplified tree for the sentence
    nonlocal full_tree
    # We are assuming that there is only one parsed sentence
    if full_tree is None:
        # Note the full tree of the first parsed paragraph
        full_tree = tree
    return TreeUtility._simplify_tree(tokens, tree)

with Fast_Parser(verbose=False) as parser:
    pgs, stats, _ = TreeUtility._process_text(
        parser, session, text, all_names, xform
    )
# Bail out if nothing was parsed, or if the very first sentence of the
# first paragraph failed to parse (its token dicts contain an "err" key)
if (
    not pgs
    or stats["num_parsed"] == 0
    or not pgs[0]
    or any("err" in t for t in pgs[0][0])
):
    # The first sentence didn't parse: let's not beat around the bush with that fact
    return (None, None, stats)
# Return the simplified tree, full tree and stats
assert full_tree is not None
return (pgs[0][0], full_tree, stats)
# binary grammar file, regardless of file timestamps. This helps
# in query development, as query grammar fragment strings may change
# without any .grammar source file change (which is the default
# trigger for generating new binary grammar files).
return self.read_from_generator(
fname,
grammar_generator(),
verbose,
binary_fname,
force_new_binary=Settings.DEBUG,
)
except (IOError, OSError):
raise GrammarError("Unable to open or read grammar file", fname, 0)
class QueryParser(Fast_Parser):
    """ A subclass of Fast_Parser, specialized to parse queries """

    # The compiled query grammar lives in its own binary file, derived
    # from the base grammar file name
    _GRAMMAR_BINARY_FILE = Fast_Parser._GRAMMAR_FILE + ".query.bin"

    # Keep a separate grammar class instance and time stamp for
    # QueryParser. This Python sleight-of-hand overrides
    # class attributes that are defined in BIN_Parser, see binparser.py.
    _grammar = None
    _grammar_ts = None
    _grammar_class = QueryGrammar

    # Also keep separate class-level copies of the C grammar
    # object and its timestamp
    _c_grammar = ffi.NULL
    _c_grammar_ts = None
# Announce startup on both the log and stdout
log_str = "Greynir instance starting with " "host={0}:{1}, db_host={2}:{3} on Python {4}".format(
    Settings.HOST,
    Settings.PORT,
    Settings.DB_HOSTNAME,
    Settings.DB_PORT,
    sys.version.replace("\n", " "),
)
logging.info(log_str)
print(log_str)
sys.stdout.flush()
# Running as a server module: pre-load the grammar into memory by
# entering (and immediately exiting) a parser context
with Fast_Parser() as fp:
    pass
def _init_class(cls) -> None:
    """ Initialize class attributes: lazily create the shared parser
        instance on first use """
    if cls._parser is None:
        # Created once and cached on the class;
        # verbose=False suppresses parser diagnostics
        cls._parser = Fast_Parser(verbose=False)  # Don't emit diagnostic messages
else:
# Terminal: append the text
result.append(node["x"].replace(" ", "_"))
# This uses a custom simplification scheme
simple_tree = TreeUtility._simplify_tree(
tokens,
tree,
nt_map=_TEST_NT_MAP,
id_map=_TEST_ID_MAP,
terminal_map=_TEST_TERMINAL_MAP,
)
push(simple_tree)
return "".join(result)
# Parse the text with diagnostics suppressed, applying the xform
# defined in the enclosing function to each sentence
with Fast_Parser(verbose=False) as parser:
    pgs, stats, _ = TreeUtility._process_text(
        parser, session, text, all_names=None, xform=xform
    )
# pgs is a list of paragraphs, each being a list of sentences
# To access the first parsed sentence, use pgs[0][0]
return (pgs, stats)
def tag_text(session, text, all_names=False):
    """ Tokenize and tag plain text. Returns the parsed paragraphs as
        lists of sentences, each sentence being a list of tagged tokens. """
    # Run the parser without diagnostic output
    parser = Fast_Parser(verbose=False)
    with parser:
        result = TreeUtility.raw_tag_text(parser, session, text, all_names=all_names)
    return result