How to use the reynir.bintokenizer.tokenize function in reynir

To help you get started, we’ve selected a few reynir code examples based on popular ways the library is used in public projects.

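Before diving into the repository snippets below, here is a minimal sketch of calling the tokenizer directly. The example sentence is illustrative; tokenize(), tok.txt and tok.kind are the same parts of the API that the snippets rely on.

from reynir import bintokenizer

# Tokenize a short Icelandic sentence; tokenize() yields a stream of tokens
text = "Jón fór til Reykjavíkur í gær."
for tok in bintokenizer.tokenize(text):
    # Each token carries its surface text (tok.txt) and a kind (tok.kind);
    # structural tokens such as sentence boundaries have no text
    if tok.txt:
        print(tok.kind, tok.txt)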

Example from mideind/Greynir, nn/nntree.py (view on GitHub)

def tokenize_and_merge_possible_mw_tokens(text, flat_tree):
    mw_tokens = list(bintokenizer.tokenize(text))  # multi-word tokens
    mw_tokens = [tok.txt.split(" ") for tok in mw_tokens if tok.txt is not None]
    sw_tokens = [tok for toks in mw_tokens for tok in toks]  # single-word tokens

    parse_tokens = list(flat_tree.split(" "))
    parse_terminals = filter(lambda x: x[1][0].islower(), enumerate(parse_tokens))

    leaf_idx_to_parse_idx = {
        leaf_idx: ptok_idx
        for (leaf_idx, (ptok_idx, ptok)) in enumerate(parse_terminals)
    }

    offset = 0
    merge_list = []
    for mw_token in mw_tokens:
        sw_count = len(mw_token)
        idxed_mw_token = [(idx + offset, token) for (idx, token) in enumerate(mw_token)]
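
The loop above aligns multi-word tokens (person names, dates and amounts that the tokenizer merges into a single token) with the single-word leaves of the parse tree. A standalone sketch of just the flattening step, with an illustrative sentence:

from reynir import bintokenizer

text = "Halldór Kiljan Laxness fékk Nóbelsverðlaunin árið 1955."  # illustrative
# Multi-word tokens keep their internal spaces in tok.txt ...
mw_tokens = [tok.txt.split(" ") for tok in bintokenizer.tokenize(text) if tok.txt is not None]
# ... so splitting on spaces and flattening recovers the single-word sequence
sw_tokens = [word for words in mw_tokens for word in words]
print(sw_tokens)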

Example from mideind/Greynir, nn/nnclient.py (view on GitHub)

def _normalize_text(cls, text):
        """ Preprocess text and normalize for parsing network """
        pgs = text.split("\n")
        normalized_pgs = [
            [
                tok.txt
                for tok in list(bintokenizer.tokenize(pg))
                if BIN_Token.is_understood(tok)
            ]
            for pg in pgs
        ]
        return [
            " ".join(tok for tok in npg if tok) for npg in normalized_pgs
        ]
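
A hedged standalone sketch of the same normalization outside the class, assuming BIN_Token can be imported from reynir.binparser (an assumed import path, not shown in the snippet); the input text is illustrative.

from reynir import bintokenizer
from reynir.binparser import BIN_Token  # assumed import path

def normalize_paragraph(pg: str) -> str:
    """ Keep only tokens the parser understands and re-join their text """
    toks = [tok.txt for tok in bintokenizer.tokenize(pg) if BIN_Token.is_understood(tok)]
    return " ".join(t for t in toks if t)

print(normalize_paragraph("Þetta er einföld setning."))  # illustrative input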

Example from mideind/Greynir, nn/utils.py (view on GitHub)

def index_text(text: str) -> Tuple[Dict[int, List[int]], Dict[int, str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns:
            dictionary of paragraph indices to constituent sentence indices
            dictionary of sentence indices to sentence text"""
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)

    pgs = tokenizer.paragraphs(tok_stream)
    pg_idx_to_sent_idx = dict()  # type: Dict[int, List[int]]
    sent_idx_to_sent = dict()  # type: Dict[int, str]
    curr_sent_idx = 0
    curr_pg_idx = 0

    for pg in pgs:
        sent_idxs = []
        for _, sent in pg:
            curr_sent = list(filter(BIN_Token.is_understood, sent))  # type: List[Tok]
            curr_sent_text = tokenizer.normalized_text_from_tokens(curr_sent)
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent_text
            curr_sent_idx += 1
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1

    # Return order follows the annotated return type above
    return pg_idx_to_sent_idx, sent_idx_to_sent
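
A usage sketch for index_text, assuming the Greynir source tree is on the import path so that nn.utils is importable; the two-paragraph input is illustrative.

from nn.utils import index_text  # assumes the Greynir repository is on sys.path

text = "Fyrsta málsgreinin. Hún er tvær setningar.\n\nÖnnur málsgrein."  # illustrative
pg_to_sents, sent_to_text = index_text(text)
for pg_idx, sent_idxs in pg_to_sents.items():
    for sent_idx in sent_idxs:
        print(pg_idx, sent_idx, sent_to_text[sent_idx])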

Example from mideind/Greynir, nn/nnclient.py (view on GitHub)

def _normalize_sentence(cls, single_sentence):
        """ Preprocess text and normalize for parsing network """
        return [
            tok.txt
            for tok in bintokenizer.tokenize(single_sentence)
            if BIN_Token.is_understood(tok)
        ]

Example from mideind/Greynir, routes/words.py (view on GitHub)

            w = san
        elif t.kind == TOK.PERSON:
            cat = "person_" + t.val[0].gender
        elif t.kind == TOK.ENTITY:
            cat = "entity"
        return (w, cat)

    # Parse arg string into word/cat tuples
    wds = _str2words(warg)

    # Try to tokenize each item that doesn't have a category
    nwds = []
    for w, c in wds:
        if c is None or c == CAT_UNKNOWN:
            # Try to tokenize
            tokens = list(filter(lambda x: x.kind in _VALID_TOKENS, tokenize(w)))
            for t in tokens:
                nwds.append(cat4token(t))
        else:
            nwds.append((w, c))

    # Filter all words not in allowed category and restrict no. words
    words = list(filter(lambda x: x[1] in _VALID_WCATS, nwds))
    words = words[:_MAX_NUM_WORDS]

    # Generate date labels
    now = datetime.utcnow()
    delta = date_to - date_from
    with changedlocale(category="LC_TIME"):
        # Group by week if period longer than 3 months
        label_date_strings = []  # type: List[Union[str, Tuple[str, str]]]
        if delta.days >= _SHOW_WEEKS_CUTOFF:
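
The fallback above classifies otherwise uncategorized words by their token kind. A hedged sketch of the same idea in isolation, assuming TOK is imported from the tokenizer package that reynir builds on; the input word is illustrative.

from tokenizer import TOK
from reynir import bintokenizer

word = "Halldór"  # illustrative single word with no explicit category
for t in bintokenizer.tokenize(word):
    if t.kind == TOK.PERSON:
        # Person tokens carry name variants with gender information, as used above
        print("person", t.val[0].gender)
    elif t.kind == TOK.ENTITY:
        print("entity", t.txt)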

Example from mideind/Greynir, nn/utils.py (view on GitHub)

def split_text(text: str) -> List[List[str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns a list of lists
    """
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)
    pgs = tokenizer.paragraphs(tok_stream)
    data = []  # type: List[List[str]]
    for pg in pgs:
        pg_data = []  # type: List[str]
        for _, sentence in pg:
            sentence = list(filter(BIN_Token.is_understood, sentence))
            sentence_text = tokenizer.normalized_text_from_tokens(sentence)
            pg_data.append(sentence_text)
        data.append(pg_data)
    return data
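
A matching usage sketch for split_text, under the same assumption that nn.utils from the Greynir tree is importable; the input text is illustrative.

from nn.utils import split_text  # assumes the Greynir repository is on sys.path

text = "Fyrsta setningin. Önnur setningin.\n\nNý málsgrein."  # illustrative
for pg in split_text(text):
    for sentence in pg:
        print(sentence)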