How to use the reynir.tokenize function in reynir

To help you get started, we've selected a few reynir examples based on popular ways the library is used in public projects.
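Before the project snippets below, here is a minimal sketch of calling tokenize directly. It assumes only the reynir package itself; tokenize returns a lazy generator of token namedtuples, and the sample sentence is arbitrary.

from reynir import tokenize

for tok in tokenize("Geysir er gamall goshver."):
    # Sentence-boundary markers carry no text, hence the guard
    if tok.txt:
        print(tok.kind, tok.txt)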

From mideind/Greynir: tests/test_processors.py
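    # Assumed imports for this excerpt (module paths may vary by version):
    # from collections import OrderedDict
    # from reynir import tokenize
    # from reynir.fastparser import Fast_Parser, ParseForestDumper
    # from reynir.incparser import IncrementalParser
    # from treeutil import TreeUtility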
    text = """
        Í miðbæ Reykjavíkur er herrafataverslunin Geysir.

       Mér er sagt að Geysir sé hættur að gjósa.
       
       Geysir er hættur að gjósa.
       
       Geysir er gamall goshver.
       
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
       
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            trees[num_sent] = tree
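The test above drives Greynir's internal Fast_Parser and IncrementalParser directly. For comparison, a hedged sketch of the same tokenize-and-parse flow through the package's public API (the entry class is named Reynir in the reynir package; its successor package greynir renamed it Greynir):

from reynir import Reynir

r = Reynir()
job = r.submit("Geysir er gamall goshver. Geysir er hættur að gjósa.")
for sent in job:
    sent.parse()
    print(sent.tree.flat if sent.tree else "(no parse)")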
From mideind/Greynir: fetcher.py
def to_tokens(soup, enclosing_session=None):
    """ Convert an HTML soup root into a parsable token stream """

    # Extract the text content of the HTML into a list
    tlist = Fetcher.TextList()
    Fetcher.extract_text(soup, tlist)
    text = tlist.result()

    # Tokenize the resulting text, returning a generator
    token_stream = tokenize(text)
    return recognize_entities(token_stream, enclosing_session=enclosing_session)
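Fetcher.TextList and Fetcher.extract_text are Greynir-internal helpers. As a rough stand-in, a hedged sketch of the same idea with plain BeautifulSoup (bs4 is an assumption here, not necessarily what Fetcher uses under the hood):

from bs4 import BeautifulSoup
from reynir import tokenize

html = "<p>Í miðbæ Reykjavíkur er herrafataverslunin Geysir.</p>"
soup = BeautifulSoup(html, "html.parser")
# get_text() is a crude substitute for Fetcher.extract_text()
tokens = list(tokenize(soup.get_text(separator=" ")))
print(sum(1 for t in tokens if t.txt), "text-bearing tokens")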
From mideind/Greynir: treeutil.py
def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of
        a transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register. """
    t0 = time.time()
    # Demarcate paragraphs in the input
    text = mark_paragraphs(text)
    # Tokenize the result
    token_stream = tokenize(text)
    toklist = list(recognize_entities(token_stream, enclosing_session=session))
    t1 = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)

    if all_names is None:
        register = None
    else:
        from queries.builtin import create_name_register

        register = create_name_register(toklist, session, all_names=all_names)

    t2 = time.time()
    stats["tok_time"] = t1 - t0
    stats["parse_time"] = t2 - t1
    stats["total_time"] = t2 - t0
    return (pgs, stats, register)
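Note that tokenize() is lazy, so the list() call above is what actually performs the tokenization being timed. A stripped-down sketch of that measurement pattern:

import time
from reynir import tokenize

t0 = time.time()
toklist = list(tokenize("Íslendingar stofnuðu Eimskipafélag Íslands hf."))
t1 = time.time()
print("tok_time: {0:.3f} s for {1} tokens".format(t1 - t0, len(toklist)))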
From mideind/Greynir: query.py
def parse(self, result):
    """ Parse the query from its string, returning True if valid """
    self._tree = None  # Erase previous tree, if any
    self._error = None  # Erase previous error, if any
    self._qtype = None  # Erase previous query type, if any
    self._key = None
    self._toklist = None

    q = self._query.strip()
    if not q:
        self.set_error("E_EMPTY_QUERY")
        return False

    toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
    toklist = list(toklist)
    # The following seems not to be needed and may complicate things
    # toklist = list(recognize_entities(toklist, enclosing_session=self._session))

    actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
    if actual_q:
        actual_q = actual_q[0].upper() + actual_q[1:]
        if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
            actual_q += "?"

    # Update the beautified query string, as the actual_q string
    # probably has more correct capitalization
    self.set_beautified_query(actual_q)

    if Settings.DEBUG:
        # Log the query string as seen by the parser
        ...  # excerpt truncated here
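The tokenize / join / correct_spaces round-trip above produces a normalized version of the query string; correct_spaces lives in the underlying tokenizer package. A hedged sketch of that normalization in isolation:

from reynir import tokenize
from tokenizer import correct_spaces

toklist = list(tokenize("hver er  forseti íslands", auto_uppercase=True))
actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
print(actual_q)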
From mideind/Greynir: postagger.py
def tag(self, toklist_or_text):
    """ Assign IFD tags to the given toklist, putting the tag in the
        "i" field of each non-punctuation token. If a string is passed,
        tokenize it first. Return the toklist so modified. """
    if isinstance(toklist_or_text, str):
        toklist = list(tokenize(toklist_or_text))
    else:
        toklist = list(toklist_or_text)

    tagsets = []
    for t in toklist:
        if not t.txt:
            continue
        taglist = self.tag_single_token(t)
        if taglist:
            #    display = " | ".join("{0} {1:.2f}".format(w, p) for w, p in taglist)
            #    print("{0:20}: {1}".format(t.txt, display))
            tagsets.append(taglist)

    _, tags = self._most_likely(tagsets)

    if not tags:
        ...  # excerpt truncated here
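The "if not t.txt: continue" guard above skips tokens that carry no text, such as the sentence-begin and sentence-end markers that tokenize() emits. A quick sketch that makes those markers visible (TOK.descr is the kind-to-name mapping from the underlying tokenizer package):

from reynir import tokenize
from tokenizer import TOK

for t in tokenize("Geysir er gamall goshver."):
    # S_BEGIN / S_END markers print with txt None here
    print(TOK.descr[t.kind], repr(t.txt))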