How to use the depccg.tokens.Token class in depccg

To help you get started, we’ve selected a few depccg examples, based on popular ways it is used in public projects.
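
Based on the constructor calls in the excerpts below, a Token is built from plain keyword arguments such as word, lemma, pos, entity and chunk, and its fields are read back as attributes (for example token.word). The sketch here is illustrative only; the field values are made up and it is not taken from the depccg source:

from depccg.tokens import Token

# A hand-built token; the excerpts below conventionally fill unknown
# fields with the placeholder 'XX'.
token = Token(word='Dogs', lemma='dog', pos='NNS', entity='O', chunk='XX')
print(token.word, token.pos)  # attribute access, as used in the excerpts below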


Source: masashi-y/depccg, depccg/__main__.py (view on GitHub)
    elif not sys.stdin.isatty():
        input_type = sys.stdin
    else:
        # reading from keyboard
        input_type = None
        sys.stdout.flush()
        sys.stderr.flush()
        logging.getLogger().setLevel(logging.CRITICAL)

    while True:
        fin = [line for line in ([input()] if input_type is None else input_type) if len(line.strip()) > 0]
        if len(fin) == 0:
            break

        if args.input_format == 'POSandNERtagged':
            tagged_doc = [[Token.from_piped(token) for token in sent.strip().split(' ')] for sent in fin]
            doc = [' '.join(token.word for token in sent) for sent in tagged_doc]
            res = parser.parse_doc(doc,
                                   probs=probs,
                                   tag_list=tag_list,
                                   batchsize=args.batchsize)
        elif args.input_format == 'json':
            doc = [json.loads(line) for line in fin]
            tagged_doc = annotate_fun(
                [[word for word in sent['words'].split(' ')] for sent in doc])
            res = parser.parse_json(doc)
        elif args.input_format == 'partial':
            doc, constraints = zip(*[read_partial_tree(l.strip()) for l in fin])
            tagged_doc = annotate_fun(doc)
            res = parser.parse_doc(doc,
                                   probs=probs,
                                   tag_list=tag_list,
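
In the 'POSandNERtagged' branch above, each input line is a space-separated sequence of piped tokens, which Token.from_piped (shown in the next excerpt) turns into Token objects. A hypothetical input line, not taken from the project, might look like this:

# WORD|POS|NER for each token, tokens separated by single spaces
line = 'Dogs|NNS|O bark|VBP|O .|.|O'
tagged_sent = [Token.from_piped(token) for token in line.strip().split(' ')]
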
Source: masashi-y/depccg, depccg/tokens.py (view on GitHub)
    @classmethod
    def from_piped(cls, string: str) -> 'Token':
        # WORD|POS|NER or WORD|LEMMA|POS|NER
        # or WORD|LEMMA|POS|NER|CHUNK
        items = string.split('|')
        if len(items) == 5:
            word, lemma, pos, entity, chunk = items
        elif len(items) == 4:
            word, lemma, pos, entity = items
            chunk = 'XX'
        else:
            assert len(items) == 3
            word, pos, entity = items
            lemma = 'XX'
            chunk = 'XX'

        return Token(word=word,
                     lemma=lemma,
                     pos=pos,
                     entity=entity,
                     chunk=chunk)
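
A quick sketch of the three accepted formats; the example strings are invented rather than taken from the depccg tests:

# 3 fields, WORD|POS|NER: lemma and chunk default to 'XX'
Token.from_piped('ran|VBD|O')
# 4 fields, WORD|LEMMA|POS|NER: chunk defaults to 'XX'
Token.from_piped('ran|run|VBD|O')
# 5 fields, WORD|LEMMA|POS|NER|CHUNK
Token.from_piped('ran|run|VBD|O|I-VP')
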
Source: masashi-y/depccg, depccg/tokens.py (view on GitHub)
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error('failed to import janome. please install it by "pip install janome".')
        exit(1)

    logger.info('use Janome to tokenize and annotate POS infos.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = tokenizer.tokenize(sentence)
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        raw_sentence = [token.surface for token in tokenized]
        res.append(tokens)
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
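
As an illustration of the loop above (the surface form, reading and tags are invented, and Janome's actual output may differ), a single Japanese word would come out roughly as:

# Janome reports part_of_speech as one comma-joined string, which the loop
# splits into pos, pos1, pos2 and pos3.
Token(word='猫', surf='猫', pos='名詞', pos1='一般', pos2='*', pos3='*',
      inflectionForm='*', inflectionType='*', reading='ネコ', base='猫')
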
Source: masashi-y/depccg, depccg/tokens.py (view on GitHub)
    res, error = proc.communicate()
    try:
        tagged_sentences = res.decode('utf-8').strip().split('\n')
        tagged_sentences = [[tuple(token.split('|')) for token in sentence.strip().split(' ')]
                            for sentence in tagged_sentences]
    except:
        raise RuntimeError('failed to process C&C output. there might have been some problem '
                           'during running C&C pipeline?\n'
                           f'stderr:\n {error}')

    res = []
    for sentence in tagged_sentences:
        words, poss = zip(*[(word, pos) for word, pos, _ in sentence])
        lemmas = stemmer.analyze(list(words), list(poss))
        tokens = [Token(word=word, pos=pos, entity=ner, lemma=lemma.lower(), chunk='XX')
                  for (word, pos, ner), lemma in zip(sentence, lemmas)]
        res.append(tokens)
    return res
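
Each entry of tagged_sentences above is a (word, pos, ner) triple from the C&C output, and the stemmer supplies the lowercased lemma, so one resulting token would look roughly like this (the values are hypothetical):

Token(word='Dogs', pos='NNS', entity='O', lemma='dog', chunk='XX')
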
Source: masashi-y/depccg, depccg/tools/reader.py (view on GitHub)
    def reduce(item: str) -> None:
        nonlocal position
        if item[-1] != ')':
            token = Token(word=item)
            tokens.append(token)
            stack.append(item)
            return

        reduce(item[:-1])
        if isinstance(stack[-1], str):
            word = stack.pop()
            category = stack.pop()
            tree = Tree.make_terminal(word, category, lang)
            position += 1
        else:
            assert isinstance(stack[-1], Tree)
            children = []
            while isinstance(stack[-1], Tree):
                tree = stack.pop()
                children.append(tree)
Source: masashi-y/depccg, depccg/tools/reader.py (view on GitHub)
                cat = Category.parse(attrib['cat'])
                children = [rec(child) for child in node.getchildren()]
                if len(children) == 1:
                    return Tree.make_unary(cat, children[0], lang)
                else:
                    assert len(children) == 2
                    left, right = children
                    combinator = guess_combinator_by_triplet(
                                    binary_rules, cat, left.cat, right.cat)
                    combinator = combinator or UNKNOWN_COMBINATOR
                    return Tree.make_binary(cat, left, right, combinator, lang)
            else:
                assert node.tag == 'lf'
                cat = Category.parse(attrib['cat'])
                word = attrib['word']
                token = Token(word=attrib['word'],
                              pos=attrib['pos'],
                              entity=attrib['entity'],
                              lemma=attrib['lemma'],
                              chunk=attrib['chunk'])
                tokens.append(token)
                return Tree.make_terminal(word, cat, lang)
        tokens = []
Source: masashi-y/depccg, depccg/tokens.py (view on GitHub)
    res = []
    for sentence in docs:
        tokens = []
        for token in sentence:
            if token.ent_iob_ == 'O':
                ner = token.ent_iob_
            else:
                ner = token.ent_iob_ + '-' + token.ent_type_

            # takes care of pronoun
            if token.lemma_ == '-PRON-':
                lemma = str(token).lower()
            else:
                lemma = token.lemma_.lower()
            tokens.append(
                Token(word=str(token),
                      pos=token.tag_,
                      entity=ner,
                      lemma=lemma,
                      chunk='XX'))
        res.append(tokens)
    if tokenize:
        return res, raw_sentences
    else:
        return res
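
A brief sketch of the NER and lemma handling above; the spaCy attribute values are illustrative and assume a spaCy v2 pipeline, where pronoun lemmas are reported as the placeholder '-PRON-':

# An 'O' token keeps ent_iob_ as its entity label, while named entities
# become e.g. 'B-GPE'; a pronoun such as 'She' falls back to its
# lowercased surface form as the lemma.
Token(word='She', pos='PRP', entity='O', lemma='she', chunk='XX')
Token(word='London', pos='NNP', entity='B-GPE', lemma='london', chunk='XX')
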
Source: masashi-y/depccg, depccg/tools/reader.py (view on GitHub)
                return Tree.make_terminal(word, cat, lang)

        spans = {span.attrib['id']: span for span in tree.xpath('./span')}
        return rec(spans[tree.attrib['root']])

    trees = etree.parse(filename).getroot()
    sentences = trees[0][0].xpath('sentence')
    for sentence in sentences:
        token_and_ids = []
        for token in sentence.xpath('.//token'):
            token_attribs = dict(token.attrib)
            token_id = token_attribs['id']
            for no_need in ['id', 'start', 'cat']:
                if no_need in token_attribs:
                    del token_attribs[no_need]
            token_and_ids.append((token_id, Token(**token_attribs)))
        tokens = [token for _, token in token_and_ids]
        for ccg in sentence.xpath('./ccg'):
            tree = parse(ccg, dict(token_and_ids))
            yield ccg.attrib['id'], tokens, tree