# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
import tensorflow as tf

from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR, RADICAL_CHAR_EMBEDDING_100
from tests import cdroot
# Fine-tune a BERT-base Chinese tokenizer on the CTB6 CWS corpus and report span F1.
cdroot()

save_dir = 'data/model/cws_bert_base_ctb6'
tokenizer = TransformerTokenizer()
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              transformer='bert-base-chinese',
              metrics='f1')
# To reuse a finished run instead of retraining: tokenizer.load(save_dir)
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot
# Fine-tune an ALBERT-base (TF2) checkpoint on CTB6 for word segmentation.
cdroot()

save_dir = 'data/model/cws_bert_albert_ctb6'
tokenizer = TransformerTokenizer()
# NOTE(review): machine-specific absolute checkpoint path — parameterize before sharing.
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2',
              metrics='f1',
              learning_rate=5e-5,
              epochs=3)
tokenizer.load(save_dir)
samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tok import TransformerTokenizer
from hanlp.datasets.cws.ctb import CTB6_CWS_VALID, CTB6_CWS_TEST
from tests import cdroot
# Train BERT-base on a large corpus file ('100million' per the save path),
# validating on CTB6. Trains with token accuracy, then reloads with span F1
# for the final evaluation.
cdroot()

tokenizer = TransformerTokenizer()
save_dir = 'data/model/cws_bert_base_100million'
train_corpus = 'data/cws/large/all.txt'
tokenizer.fit(train_corpus,
              CTB6_CWS_VALID,
              save_dir,
              transformer='bert-base-chinese',
              metrics='accuracy',
              batch_size=32)
# Switch the metric so evaluate() reports segmentation F1 rather than accuracy.
tokenizer.load(save_dir, metrics='f1')
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# Training run that feeds pre-trained character word2vec features into the
# tokenizer. Relies on `tokenizer` and `save_dir` left bound by the preceding
# script section.
# NOTE(review): `tf` and CONVSEG_W2V_NEWS_TENSITE_CHAR were used here without
# being imported anywhere in this file — imports added at the top of the file.
# NOTE(review): `word_embed`/`window_size`/`weight_norm` do not look like
# TransformerTokenizer.fit() options; presumably this section was written for
# a different (ngram-conv) tokenizer class — TODO confirm against the caller.
word_embed = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': True,                          # fine-tune the embeddings during training
        'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,  # pre-trained character vectors
        'expand_vocab': False,
        'lowercase': False,
    },
}
# Adam with gradient-norm clipping (clipnorm=5) for training stability.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, clipnorm=5)
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_VALID,
              save_dir,
              word_embed=word_embed,
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
# NOTE(review): this section is a byte-for-byte duplicate of the run directly
# above — presumably an accidental double paste when the scripts were
# concatenated; confirm whether a second identical training run is intended.
# Adam optimizer with gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
epsilon=1e-8, clipnorm=5)
# Train with pre-trained character word2vec features (fine-tuned, fixed vocab).
tokenizer.fit(CTB6_CWS_TRAIN,
CTB6_CWS_VALID,
save_dir,
word_embed={'class_name': 'HanLP>Word2VecEmbedding',
'config': {
'trainable': True,
'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
'expand_vocab': False,
'lowercase': False,
}},
optimizer=optimizer,
window_size=0,
weight_norm=True)
# Score on the CTB6 test split without writing prediction output files.
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
# NOTE(review): orphaned fragment — these are keyword arguments of a
# `tokenizer.fit(` call whose opening line (and training-file argument) is
# missing above, so this span is a syntax error as-is. Restore the missing
# call header before running.
CTB6_CWS_VALID,
save_dir,
embeddings={'class_name': 'HanLP>Word2VecEmbedding',
'config': {
'trainable': True,
# presumably radical-augmented character embeddings — TODO confirm the import source
'filepath': RADICAL_CHAR_EMBEDDING_100,
'expand_vocab': False,
'lowercase': False,
}},
early_stopping_patience=5,
batch_size=64,
max_seq_len=64,
metrics='accuracy'
)
# Reload with span F1 so the final evaluation reports segmentation F1.
tokenizer.load(save_dir, metrics='f1')
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')