How to use depccg - 8 common examples

To help you get started, we've selected a few depccg examples based on common ways the library is used in public projects.

Example 1: depccg/fixed_length_n_step_lstm.py (github.com/masashi-y/depccg)
def _make_tensor_descriptor_array(xs, length):
    """Make an array of pointers to cuDNN tensor descriptors, one per time step."""
    # `cudnn` and `PointerArray` come from the Chainer/CuPy CUDA backend
    # imported in the surrounding module; `xs` concatenates the mini-batches
    # for all `length` time steps along the first axis.
    descs = []
    batch_size = xs.shape[0] // length
    for i in range(length):
        x = xs[i * batch_size:(i + 1) * batch_size]
        if x.ndim < 3:
            # cuDNN tensor descriptors require at least 3 dimensions,
            # so pad the shape with trailing singleton axes
            shape = x.shape + (1,) * (3 - x.ndim)
            x = x.reshape(shape)
        desc = cudnn.create_tensor_nd_descriptor(x)
        descs.append(desc)
    return PointerArray([d.value for d in descs], descs)
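
The slicing above assumes `xs` stacks the mini-batches for all time steps along the first axis. A minimal NumPy sketch of that layout and the per-step slicing (sizes invented; no GPU or cuDNN needed):

import numpy as np

# hypothetical sizes: 3 time steps, batch of 2, feature size 4
length, batch_size, dim = 3, 2, 4
xs = np.zeros((length * batch_size, dim), dtype='f')

for i in range(length):
    x = xs[i * batch_size:(i + 1) * batch_size]  # mini-batch for time step i
    assert x.shape == (batch_size, dim)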

Example 2: depccg/tools/reader.py (github.com/masashi-y/depccg)
def rec(node):
    # `spans`, `tokens`, `binary_rules` and `lang` are captured from the
    # enclosing reader function; `node` is one XML element of the parse.
    attrib = node.attrib
    if 'terminal' not in attrib:
        cat = Category.parse(attrib['category'])
        children = [rec(spans[child]) for child in attrib['child'].split(' ')]
        if len(children) == 1:
            return Tree.make_unary(cat, children[0], lang)
        else:
            assert len(children) == 2
            left, right = children
            combinator = guess_combinator_by_triplet(
                binary_rules, cat, left.cat, right.cat)
            combinator = combinator or UNKNOWN_COMBINATOR
            return Tree.make_binary(cat, left, right, combinator, lang)
    else:
        cat = Category.parse(attrib['category'])
        word = try_get_surface(tokens[attrib['terminal']])
        return Tree.make_terminal(word, cat, lang)
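
For orientation, a hand-written sketch of the span elements the recursion branches on (the attribute names come from the code above; the ids and categories are invented):

import xml.etree.ElementTree as ET

binary = ET.fromstring('<span category="S[dcl]" child="sp1 sp2"/>')
terminal = ET.fromstring('<span category="NP" terminal="t0"/>')
# rec() dispatches on the presence of the 'terminal' attribute:
assert 'terminal' not in binary.attrib
assert 'terminal' in terminal.attrib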

Example 3: depccg/models/my_allennlp/dataset/supertagging_dataset.py (github.com/masashi-y/depccg)
def text_to_instance(self,
                     sentence: str,
                     tags: List[str] = None,
                     deps: List[int] = None,
                     weight: float = 1.0) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    # A method of the AllenNLP DatasetReader defined in this file.
    tokens = [Token(utils.normalize(token)) for token in sentence.split(' ')]
    token_field = TextField(tokens, self._token_indexers)
    metadata = MetadataField({'words': sentence})
    # wrap the scalar weight in an ArrayField so it travels with the instance
    weight = ArrayField(numpy.array([weight], 'f'))
    fields = {
        'words': token_field,
        'metadata': metadata,
        'weight': weight,
    }
    if tags is not None and deps is not None:
        fields['head_tags'] = SequenceLabelField(
            tags, token_field, label_namespace='head_tags')
        fields['head_indices'] = SequenceLabelField(
            deps, token_field, label_namespace='head_indices')
    return Instance(fields)
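
A minimal call sketch (the sentence, supertags and head indices are invented; `reader` stands for an instance of the DatasetReader defined in this file):

instance = reader.text_to_instance(
    'John loves Mary',
    tags=['NP', '(S[dcl]\\NP)/NP', 'NP'],
    deps=[2, 0, 2],
)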

Example 4: depccg/tools/evaluate.py (github.com/masashi-y/depccg)
def read_gold_deps(file):
    # NOTE: the function name is illustrative; in evaluate.py this snippet is
    # the body of a generator that reads a C&C-style gold dependency file,
    # in which sentence boundaries are marked by lines starting with '<s>'.
    try:
        lines = open(file)
    except IOError as e:
        die(f'could not open gold_deps file ({e.strerror})')

    deps, udeps = set(), set()
    for line in lines:
        line = line.strip()
        if line.startswith('<s>'):
            # a sentence boundary: emit the dependencies collected so far
            yield deps, udeps
            deps, udeps = set(), set()
            continue
        arg_index, pred_index, cat, slot, arg, pred = line.split()[:6]
        pred = f'{utils.normalize(pred)}_{int(pred_index) + 1}'
        arg = f'{utils.normalize(arg)}_{int(arg_index) + 1}'
        deps.add((pred, cat, slot, arg))
        udeps.add((pred, arg))
    # the file must end at a sentence boundary
    assert len(deps) == 0 and len(udeps) == 0
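
Each non-marker line in the gold file is whitespace-separated as `arg_index pred_index cat slot arg pred`. For illustration (values invented), the line

2 1 (S[dcl]\NP)/NP 2 Mary loves

adds ('loves_2', '(S[dcl]\NP)/NP', '2', 'Mary_3') to deps and ('loves_2', 'Mary_3') to udeps.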

Example 5: depccg/__main__.py (github.com/masashi-y/depccg)
elif not sys.stdin.isatty():
    input_type = sys.stdin
else:
    # reading from the keyboard
    input_type = None
    sys.stdout.flush()
    sys.stderr.flush()
    logging.getLogger().setLevel(logging.CRITICAL)

while True:
    fin = [line for line in ([input()] if input_type is None else input_type)
           if len(line.strip()) > 0]
    if len(fin) == 0:
        break

    if args.input_format == 'POSandNERtagged':
        tagged_doc = [[Token.from_piped(token) for token in sent.strip().split(' ')]
                      for sent in fin]
        doc = [' '.join(token.word for token in sent) for sent in tagged_doc]
        res = parser.parse_doc(doc,
                               probs=probs,
                               tag_list=tag_list,
                               batchsize=args.batchsize)
    elif args.input_format == 'json':
        doc = [json.loads(line) for line in fin]
        tagged_doc = annotate_fun(
            [[word for word in sent['words'].split(' ')] for sent in doc])
        res = parser.parse_json(doc)
    elif args.input_format == 'partial':
        doc, constraints = zip(*[read_partial_tree(l.strip()) for l in fin])
        tagged_doc = annotate_fun(doc)
        res = parser.parse_doc(doc,
                               probs=probs,
                               tag_list=tag_list,
                               batchsize=args.batchsize)
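
In the POSandNERtagged format, each input line is one sentence of `WORD|POS|NER` tokens (the sentence here is invented; see `Token.from_piped` in the next example for the accepted field layouts):

John|NNP|I-PER loves|VBZ|O Mary|NNP|I-PER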

Example 6: depccg/tokens.py (github.com/masashi-y/depccg)
@classmethod
def from_piped(cls, string: str) -> 'Token':
    # WORD|POS|NER or WORD|LEMMA|POS|NER
    # or WORD|LEMMA|POS|NER|CHUNK
    items = string.split('|')
    if len(items) == 5:
        word, lemma, pos, entity, chunk = items
    elif len(items) == 4:
        word, lemma, pos, entity = items
        chunk = 'XX'
    else:
        assert len(items) == 3
        word, pos, entity = items
        lemma = 'XX'
        chunk = 'XX'

    return Token(word=word,
                 lemma=lemma,
                 pos=pos,
                 entity=entity,
                 chunk=chunk)
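
Usage follows the field layouts in the comment; missing fields default to 'XX':

token = Token.from_piped('loves|love|VBZ|O')
# Token(word='loves', lemma='love', pos='VBZ', entity='O', chunk='XX')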

Example 7: depccg/tokens.py (github.com/masashi-y/depccg)
def annotate_with_janome(sentences):
    # NOTE: the function name is illustrative; in tokens.py this snippet is
    # the body of the Janome-based Japanese annotator.
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error('failed to import janome. please install it by "pip install janome".')
        exit(1)

    logger.info('using Janome to tokenize and annotate POS info.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        # materialize the result, since newer Janome versions return a
        # generator and `tokenized` is iterated twice below
        tokenized = list(tokenizer.tokenize(sentence))
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        raw_sentence = [token.surface for token in tokenized]
        res.append(tokens)
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
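
A minimal call sketch (the function name above is illustrative and the sentence invented; each input sentence is a sequence of strings that the function joins back together):

tokens, raw_sentences = annotate_with_janome([['すもももももももものうち']])
# tokens[0]: a list of Token objects carrying Janome POS fields
# raw_sentences[0]: the surface forms produced by the tokenizer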

Example 8: depccg/tokens.py (github.com/masashi-y/depccg)
def annotate_with_candc(proc, stemmer):
    # NOTE: the function name and parameters are illustrative; in tokens.py,
    # `proc` is the running C&C pipeline subprocess and `stemmer` a
    # lemmatizer available in the enclosing scope.
    res, error = proc.communicate()
    try:
        tagged_sentences = res.decode('utf-8').strip().split('\n')
        tagged_sentences = [[tuple(token.split('|')) for token in sentence.strip().split(' ')]
                            for sentence in tagged_sentences]
    except Exception:
        raise RuntimeError('failed to process C&C output; there may have been a problem '
                           'while running the C&C pipeline.\n'
                           f'stderr:\n {error}')

    res = []
    for sentence in tagged_sentences:
        words, poss = zip(*[(word, pos) for word, pos, _ in sentence])
        lemmas = stemmer.analyze(list(words), list(poss))
        tokens = [Token(word=word, pos=pos, entity=ner, lemma=lemma.lower(), chunk='XX')
                  for (word, pos, ner), lemma in zip(sentence, lemmas)]
        res.append(tokens)
    return res
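
Each line of the C&C pipeline's output is one sentence of `WORD|POS|NER` tokens; a line such as (invented)

John|NNP|I-PER loves|VBZ|O Mary|NNP|I-PER

becomes a list of Token objects with lower-cased lemmas from the stemmer and the chunk field fixed to 'XX'.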