How to use the hanlp.common.vocab.Vocab class in hanlp

To help you get started, we’ve selected a few hanlp examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github hankcs / HanLP / hanlp / transform / text.py View on Github external
def fit(self, trn_path: str, **kwargs) -> int:
        """Build ``self.vocab`` from the training file and count its samples.

        Args:
            trn_path: Path handed to ``self.file_to_inputs``.
            **kwargs: Accepted but not used here.

        Returns:
            The number of ``(x, y)`` pairs yielded by ``self.file_to_inputs``.
        """
        self.vocab = Vocab()
        sample_count = 0
        # Only the first element of each pair feeds the vocabulary.
        for sample_count, (inputs, _) in enumerate(self.file_to_inputs(trn_path), start=1):
            self.vocab.update(inputs)
        return sample_count
github hankcs / HanLP / hanlp / components / taggers / transformers / transformer_transform.py View on Github external
def fit(self, trn_path: str, **kwargs) -> int:
        """Build ``self.tag_vocab`` from the training file and count its samples.

        The tag vocabulary is created with ``unk_token=None``.

        Args:
            trn_path: Path handed to ``self.file_to_inputs`` (with ``gold=True``).
            **kwargs: Accepted but not used here.

        Returns:
            The number of ``(words, tags)`` pairs yielded by
            ``self.file_to_inputs``.
        """
        self.tag_vocab = Vocab(unk_token=None)
        sample_count = 0
        # Only the tag side of each pair feeds the vocabulary.
        for sample_count, (_, tag_seq) in enumerate(self.file_to_inputs(trn_path, gold=True), start=1):
            self.tag_vocab.update(tag_seq)
        return sample_count
github hankcs / HanLP / hanlp / transform / tsv.py View on Github external
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[Vocab, Vocab, Vocab]:
    """Collect word, character and tag vocabularies from a two-column file.

    The file is opened as UTF-8 and read line by line. Each non-blank line is
    split on whitespace and must yield exactly two cells, ``word`` and ``tag``
    (any other count makes the tuple unpacking raise ``ValueError``).

    Args:
        tsv_file_path: Path of the file to read.
        lower: When true, the lowercased word is added to the word vocabulary.
            The character vocabulary always receives the characters of the
            word as it appears in the file.
        lock_word_vocab: When true, ``lock()`` is called on the word vocabulary.
        lock_char_vocab: When true, ``lock()`` is called on the character
            vocabulary.
        lock_tag_vocab: Not used in the visible part of this excerpt.

    NOTE(review): this excerpt appears to be truncated. The handling of
    ``lock_tag_vocab`` and the return statement promised by the annotation
    are not visible here; confirm against the full source.
    """
    word_vocab = Vocab()
    char_vocab = Vocab()
    # The tag vocabulary is created without an unknown token.
    tag_vocab = Vocab(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            # Blank lines produce an empty list and are skipped.
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                # Characters come from the original word, not the lowercased one.
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
github hankcs / HanLP / hanlp / transform / tsv.py View on Github external
def fit(self, trn_path: str, **kwargs) -> int:
        """Build the word and tag vocabularies, and optionally the char one.

        ``self.word_vocab`` and ``self.tag_vocab`` are recreated and filled
        from the pairs yielded by ``self.file_to_inputs``. If
        ``self.char_vocab`` is truthy on entry, it is replaced by a new
        vocabulary built from the characters of every word-vocabulary entry
        other than the pad and unknown tokens.

        Args:
            trn_path: Path handed to ``self.file_to_inputs``.
            **kwargs: Accepted but not used here.

        Returns:
            The number of ``(words, tags)`` pairs read.
        """
        self.word_vocab = Vocab()
        self.tag_vocab = Vocab(pad_token=None, unk_token=None)
        sample_count = 0
        for sample_count, (word_seq, tag_seq) in enumerate(self.file_to_inputs(trn_path, True), start=1):
            self.word_vocab.update(word_seq)
            self.tag_vocab.update(tag_seq)

        # No character vocabulary requested: nothing more to build.
        if not self.char_vocab:
            return sample_count

        self.char_vocab = Vocab()
        for token in self.word_vocab.token_to_idx.keys():
            # Special tokens are not real words, so their characters are skipped.
            if token not in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                self.char_vocab.update(list(token))
        return sample_count
github hankcs / HanLP / hanlp / transform / tsv.py View on Github external
# Collect word, character and tag vocabularies from a two-column file.
# Each non-blank line must split on whitespace into exactly (word, tag).
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[Vocab, Vocab, Vocab]:
    word_vocab = Vocab()
    char_vocab = Vocab()
    # The tag vocabulary is created without an unknown token.
    tag_vocab = Vocab(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            # Blank lines produce an empty list and are skipped.
            if cells:
                # Raises ValueError when a line does not have exactly two cells.
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                # Characters come from the original word, not the lowercased one.
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    if lock_tag_vocab:
    # NOTE(review): the excerpt is cut off here; the body of this branch and
    # the return statement are not visible. Confirm against the full source.
github hankcs / HanLP / hanlp / common / vocab.py View on Github external
def create_label_vocab() -> Vocab:
    """Return a new ``Vocab`` created with neither a pad nor an unknown token."""
    label_vocab = Vocab(pad_token=None, unk_token=None)
    return label_vocab
github hankcs / HanLP / hanlp / transform / txt.py View on Github external
def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[Vocab, Vocab, Vocab]:
    """Build character, n-gram and tag vocabularies from a text file.

    Samples come from ``generate_ngram_bmes`` called with ``gold=True``. For
    each sample the first feature fills the character vocabulary, the
    remaining features fill the n-gram vocabulary (falsy entries dropped),
    and the labels fill the tag vocabulary.

    Args:
        txt_file_path: Path forwarded to ``generate_ngram_bmes``.
        bigram_only: Forwarded to ``generate_ngram_bmes``.
        window_size: Forwarded to ``generate_ngram_bmes``.
        **kwargs: Accepted but not used here.

    Returns:
        The tuple ``(char_vocab, ngram_vocab, tag_vocab)``.
    """
    char_vocab = Vocab()
    ngram_vocab = Vocab()
    # The tag vocabulary carries neither a pad nor an unknown token.
    tag_vocab = Vocab(pad_token=None, unk_token=None)
    samples = generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True)
    for features, labels in samples:
        char_vocab.update(features[0])
        for ngram_column in features[1:]:
            # filter(None, ...) keeps only truthy entries.
            ngram_vocab.update(filter(None, ngram_column))
        tag_vocab.update(labels)
    return char_vocab, ngram_vocab, tag_vocab
github hankcs / HanLP / hanlp / common / component.py View on Github external
def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Restore vocabularies from a JSON file onto ``self.transform``.

        Args:
            save_dir: Location passed through ``get_resource`` to obtain the
                directory that holds the file.
            filename: Name of the JSON file inside that directory.

        Each top-level entry of the loaded file becomes an attribute of
        ``self.transform``: the key is the attribute name and the value is
        copied into a new ``Vocab`` via ``copy_from``.
        """
        resolved_dir = get_resource(save_dir)
        json_path = os.path.join(resolved_dir, filename)
        stored = SerializableDict()
        stored.load_json(json_path)
        for attr_name, state in stored.items():
            restored = Vocab()
            restored.copy_from(state)
            setattr(self.transform, attr_name, restored)