How to use hanlp - common examples

To help you get started, we've selected a few hanlp examples based on popular ways the library is used in public projects.

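If you are new to the library, the minimal sketch below shows the entry point used throughout these examples: hanlp.load() fetches a pretrained component by identifier (downloading it on first use), and the component is then called directly. Treat the exact identifier as an assumption; any entry from hanlp.pretrained works the same way.

import hanlp

# load a pretrained Chinese word segmenter by identifier
# (CTB6_CONVSEG is assumed to be one of the shipped CWS identifiers)
tokenizer = hanlp.load(hanlp.pretrained.cws.CTB6_CONVSEG)
print(tokenizer('商品和服务'))  # expected: ['商品', '和', '服务']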

hankcs/HanLP: hanlp/components/parsers/conll.py
This snippet from the CoNLL data component buckets sentences by length with k-means, then yields batches that each hold roughly batch_size tokens:
lengths = [self.len_of_sent(i) for i in corpus]
# use a single bucket for tiny corpora, otherwise up to n_buckets
if len(corpus) < 32:
    n_buckets = 1
else:
    n_buckets = min(self.config.n_buckets, len(corpus))
# cluster sentence lengths with k-means: centroid length -> sentence indices
buckets = dict(zip(*kmeans(lengths, n_buckets)))
sizes, buckets = zip(*[
    (size, bucket) for size, bucket in buckets.items()
])
# the number of chunks in each bucket, which is clipped by
# range [1, len(bucket)]
chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1)) for size, bucket in
          zip(sizes, buckets)]
range_fn = randperm if shuffle else arange
max_samples_per_batch = self.config.get('max_samples_per_batch', None)
for i in tolist(range_fn(len(buckets))):
    split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                   for j in range(chunks[i])]  # how many sentences in each batch
    for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
        indices = [buckets[i][j] for j in tolist(batch_indices)]
        if max_samples_per_batch:
            # further split oversized batches to respect the sample cap
            for j in range(0, len(indices), max_samples_per_batch):
                yield from self.batched_inputs_to_batches(corpus, indices[j:j + max_samples_per_batch],
                                                          shuffle)
        else:
            yield from self.batched_inputs_to_batches(corpus, indices, shuffle)
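The snippet relies on HanLP-internal helpers (kmeans, tolist, batched_inputs_to_batches). The standalone sketch below reimplements the core idea in plain Python so the algorithm is easier to follow; the function names and the toy 1-D k-means are mine, not HanLP's:

import random

def length_buckets(lengths, n_buckets):
    """Toy 1-D k-means over sentence lengths: centroid -> sentence indices."""
    centroids = sorted(random.sample(lengths, n_buckets))
    clusters = {}
    for _ in range(10):  # a handful of refinement steps suffices for a sketch
        clusters = {c: [] for c in centroids}
        for i, length in enumerate(lengths):
            nearest = min(centroids, key=lambda c: abs(c - length))
            clusters[nearest].append(i)
        centroids = [sum(lengths[i] for i in idx) / len(idx)
                     for idx in clusters.values() if idx]
    return {c: idx for c, idx in clusters.items() if idx}

def token_balanced_batches(corpus, batch_size=5000, n_buckets=3, shuffle=True):
    """Yield batches of similar-length sentences totalling roughly
    batch_size tokens each, mirroring the bucketing logic above."""
    lengths = [len(sent) for sent in corpus]
    buckets = length_buckets(lengths, min(n_buckets, len(corpus)))
    for size, bucket in buckets.items():
        # number of chunks, clipped to [1, len(bucket)] as in the snippet
        n_chunks = min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
        if shuffle:
            random.shuffle(bucket)
        chunk_len = (len(bucket) + n_chunks - 1) // n_chunks  # ceil division
        for j in range(0, len(bucket), chunk_len):
            yield [corpus[i] for i in bucket[j:j + chunk_len]]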
hankcs/HanLP: tests/demo/zh/demo_serving.py
Load the pretrained CTB5 part-of-speech tagger, tag a pre-tokenized sentence, and start serving the model:
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-06 20:23
import hanlp
from hanlp.common.component import KerasComponent

tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN)
print(tagger('商品 和 服务'.split()))
tagger.serve()
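Because the tagger expects pre-tokenized input (note the .split() above), a raw sentence is usually segmented first. A minimal sketch chaining a pretrained segmenter with the tagger, again assuming the CTB6_CONVSEG identifier from hanlp.pretrained.cws:

import hanlp

tokenizer = hanlp.load(hanlp.pretrained.cws.CTB6_CONVSEG)  # assumed identifier
tagger = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN)

tokens = tokenizer('商品和服务')          # expected: ['商品', '和', '服务']
print(list(zip(tokens, tagger(tokens))))  # pair each token with its POS tag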
hankcs/HanLP: tests/train/zh/cws/train_ctb6_cws_bert.py
Fine-tune bert-base-chinese as a word segmenter on the CTB6 corpus:
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22

from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir, transformer='bert-base-chinese',
              metrics='f1')
# tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
hankcs/HanLP: tests/train/zh/cws/train_ctb6_cws_albert.py
The same recipe with a local ALBERT checkpoint, an explicit learning rate, and three epochs:
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22

from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_albert_ctb6'
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_VALID, save_dir,
              transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2',
              metrics='f1', learning_rate=5e-5, epochs=3)
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
hankcs/HanLP: tests/train/zh/cws/train_large_bert_cws.py
Train the BERT segmenter on a large local corpus (data/cws/large/all.txt) and evaluate it on CTB6:
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot

cdroot()
tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_100million'
tokenizer.fit('data/cws/large/all.txt', CTB6_CWS_VALID, save_dir, transformer='bert-base-chinese',
              metrics='accuracy', batch_size=32)
tokenizer.load(save_dir, metrics='f1')
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
hankcs/HanLP: tests/train/zh/cws/train_ctb6_cws_convseg.py
Train a convolutional segmenter with pretrained ConvSeg character embeddings:
# The original snippet starts mid-script; the imports and setup below are
# filled in to match the sibling train_large_conv_cws.py script.
import tensorflow as tf

from hanlp.components.tok import NgramConvTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/ctb6_cws_convseg'  # assumed path; not in the snippet
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
hankcs/HanLP: tests/train/zh/cws/train_large_conv_cws.py
The complete convolutional training script, with imports and optimizer setup:
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 21:58

import tensorflow as tf

from hanlp.components.tok import NgramConvTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizer()
save_dir = 'data/model/cws/ctb6_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
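After any of these training runs, the fitted model can be restored in a fresh process from its save_dir; a minimal sketch using the directory from the script above:

from hanlp.components.tok import NgramConvTokenizer

tokenizer = NgramConvTokenizer()
tokenizer.load('data/model/cws/ctb6_cws')  # save_dir from the script above
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))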