How to use reynir.TOK in reynir

To help you get started, we've selected a few examples of reynir's TOK token-kind constants, based on popular ways they are used in public projects.

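A quick primer before the examples: TOK is the namespace of token-kind constants (WORD, PERSON, YEAR, and so on) that the tokenizer attaches to every token it emits. Here is a minimal sketch of inspecting them; the sentence is illustrative, and it assumes the GreynirPackage distribution, which re-exports TOK from the underlying tokenizer package:

from reynir import TOK
from reynir.bintokenizer import tokenize

for t in tokenize("Jón Sigurðsson fæddist árið 1811."):
    if t.kind == TOK.PERSON:
        print("person:", t.txt)
    elif t.kind == TOK.YEAR:
        print("year:", t.txt)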

From mideind/Greynir, postagger.py:

def gen_tokens():
    """ Generate a Greynir token sequence from a tagging result """
    ix = 0
    for t in toklist:
        if not t.txt:
            continue
        # The code below should correspond to TreeUtility._describe_token()
        d = dict(x=t.txt)
        if t.kind == TOK.WORD:
            # set d["m"] to the meaning
            pass
        else:
            d["k"] = t.kind
        if t.val is not None and t.kind not in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PUNCTUATION,
        }:
            # For tokens except words, entities and punctuation, include the val field
            if t.kind == TOK.PERSON:
                d["v"], d["g"] = TreeUtility.choose_full_name(
                    t.val, case=None, gender=None
                )
            else:
                d["v"] = t.val
        if t.kind in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PERSON,
            TOK.NUMBER,
            TOK.YEAR,
            TOK.ORDINAL,
            TOK.PERCENT,
        }:
            d["i"] = tags[ix]
            ix += 1
        if t.kind == TOK.WORD and " " in d["x"]:
            # Some kind of phrase: split it
            xlist = d["x"].split()
            for x in xlist:
                d["x"] = x
                if x == "og":
                    # Probably intermediate word: fjármála- og efnahagsráðherra
                    yield dict(x="og", i="c")
                else:
                    yield d.copy()
        else:
            yield d
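The generator above closes over toklist, tags and TreeUtility from its enclosing method. For a self-contained flavor of the same kind-based dispatch, here is a stripped-down sketch that uses only the underlying tokenizer package; there are no POS tags or TreeUtility here, and the sentence is illustrative:

from tokenizer import TOK, tokenize

def describe(toklist):
    """ Yield a minimal description dict for each text-bearing token """
    for t in toklist:
        if not t.txt:
            continue
        d = dict(x=t.txt)
        if t.kind != TOK.WORD:
            # Include the token kind for non-word tokens
            d["k"] = t.kind
        if t.val is not None and t.kind not in {TOK.WORD, TOK.PUNCTUATION}:
            # Include the val field, e.g. the numeric value of a NUMBER token
            d["v"] = t.val
        yield d

for d in describe(tokenize("Hún greiddi 15% af upphæðinni árið 2020.")):
    print(d)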
From mideind/Greynir, nertokenizer.py:

def recognize_entities(
    token_stream: Iterator[Tok], enclosing_session=None, token_ctor=TOK
) -> Iterator[Tok]:
    """ Parse a stream of tokens looking for (capitalized) entity names.
        The algorithm implements N-token lookahead, where N is the
        length of the longest entity name having a particular initial word.
        Adds a named entity recognition layer on top of the
        reynir.bintokenizer.tokenize() function.
    """
    # Token queue
    tq = []  # type: List[Tok]
    # Phrases we're considering. Note that an entry of None
    # indicates that the accumulated phrase so far is a complete
    # and valid known entity name.
    state = defaultdict(list)  # type: Dict[Union[str, None], List[Tuple[List[str], Entity]]]
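A hedged sketch of composing this layer over the tokenizer, as the docstring suggests. It assumes the Greynir application environment, where the nertokenizer module is importable and a database of known entities is available; the default enclosing_session=None is used here:

from reynir import TOK
from reynir.bintokenizer import tokenize
from nertokenizer import recognize_entities  # Greynir application module (assumption)

for t in recognize_entities(tokenize("Halldór Laxness skrifaði Sjálfstætt fólk.")):
    if t.kind == TOK.ENTITY:
        print("entity:", t.txt)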
From mideind/Greynir, query.py:

def _parse(toklist):
    """ Parse a token list as a query """
    bp = Query._parser
    assert bp is not None
    num_sent = 0
    num_parsed_sent = 0
    rdc = Reducer(bp.grammar)
    trees = dict()
    sent = []  # type: List[Tok]

    for t in toklist:
        if t[0] == TOK.S_BEGIN:
            sent = []
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if not slen:
                continue
            num_sent += 1
            # Parse the accumulated sentence
            num = 0
            try:
                # Parse the sentence
                forest = bp.go(sent)
                if forest is not None:
                    num = Fast_Parser.num_combinations(forest)
                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)
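The S_BEGIN/S_END bracketing in _parse() is the standard pattern for recovering sentences from a reynir/tokenizer token stream. A minimal self-contained illustration of that pattern (the sentences are arbitrary):

from tokenizer import TOK, tokenize

def split_sentences(toklist):
    """ Group a token stream into sentences using S_BEGIN/S_END markers """
    sent = []
    for t in toklist:
        if t.kind == TOK.S_BEGIN:
            sent = []
        elif t.kind == TOK.S_END:
            if sent:
                yield sent
        elif t.txt:
            sent.append(t)

for sent in split_sentences(tokenize("Þetta er fyrsta setningin. Hér er önnur.")):
    print(" ".join(t.txt for t in sent))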
From mideind/Greynir, postagger.py:

                # add 'ct' as a possibility (it does not come directly from a BÍN mark)
                s.add("ct")
            # Add a +1 bias to the counts so that no lemma/tag pairs have zero frequency
            prob = self.lemma_count(txt) + len(s)
            d = self.lemma_tags(txt)
            # It is possible for the probabilities of the tags in set s
            # not to add up to 1.0. This can happen if the tokenizer has
            # eliminated certain BÍN meanings due to updated settings
            # in Pref.conf.
            return [(tag, (d.get(tag, 0) + 1) / prob) for tag in s]

        if token.kind == TOK.WORD:
            taglist = ifd_taglist_word(token.txt, token.val)
        elif token.kind == TOK.ENTITY:
            taglist = ifd_taglist_entity(token.txt)
        elif token.kind == TOK.PERSON:
            taglist = ifd_taglist_person(token.txt, token.val)
        elif token.kind == TOK.NUMBER:
            taglist = [("tfkfn", 1.0)]  # !!!
        elif token.kind == TOK.YEAR:
            taglist = [("ta", 1.0)]
        elif token.kind == TOK.PERCENT:
            taglist = [("tp", 1.0)]
        elif token.kind == TOK.ORDINAL:
            taglist = [("lxexsf", 1.0)]
        # elif token.kind == TOK.CURRENCY:
        #    taglist = None
        # elif token.kind == TOK.AMOUNT:
        #    taglist = None
        # elif token.kind == TOK.DATE:
        #    taglist = None
        elif token.kind == TOK.PUNCTUATION:
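The +1 bias in the list comprehension above is add-one (Laplace) smoothing, so no tag in the set gets zero probability. A toy standalone version of the same arithmetic, with invented counts purely for illustration:

def smoothed_taglist(counts, tagset):
    """ Return (tag, probability) pairs with one pseudo-count per tag """
    total = sum(counts.values()) + len(tagset)
    return [(tag, (counts.get(tag, 0) + 1) / total) for tag in tagset]

# 7 + 2 observations plus 3 pseudo-counts give a denominator of 12
print(smoothed_taglist({"nken": 7, "nkeo": 2}, {"nken", "nkeo", "ct"}))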
From mideind/Greynir, queries/builtin.py:
def create_name_register(tokens, session, all_names=False) -> RegisterType:
    """ Assemble a dictionary of person and entity names
        occurring in the token list """
    register = {}  # type: RegisterType
    for t in tokens:
        if t.kind == TOK.PERSON:
            gn = t.val
            for pn in gn:
                add_name_to_register(pn.name, register, session, all_names=all_names)
        elif t.kind == TOK.ENTITY:
            add_entity_to_register(t.txt, register, session, all_names=all_names)
    return register
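Finally, a hedged usage sketch for create_name_register(). SessionContext and the register helpers are internals of the Greynir application, so the db import below is an assumption rather than a documented API:

from reynir.bintokenizer import tokenize
from db import SessionContext  # Greynir's database helper (assumption)

with SessionContext(read_only=True) as session:
    toklist = list(tokenize("Katrín Jakobsdóttir hitti forstjóra Marel."))
    register = create_name_register(toklist, session)
    print(register)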