Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-05-23 17:26
import os
from pyhanlp import SafeJClass
from tests.test_utility import ensure_data
# Bind the Java classes used below by their fully qualified names via pyhanlp's SafeJClass.
NaiveBayesClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# Location of the Sogou mini text-classification corpus, as returned by ensure_data for
# the given dataset name and zip URL.
# NOTE(review): presumably ensure_data downloads/unpacks the archive when it is missing and
# returns a local path -- confirm in tests.test_utility.
sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版',
'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')
def train_or_load_classifier():
    """
    Build a NaiveBayesClassifier backed by a model cached next to the corpus.

    The cache file is ``sogou_corpus_path + '.ser'``. When that file exists the
    model is read from it; otherwise a classifier is trained on the corpus and
    its model is written to the cache file first.

    :return: a NaiveBayesClassifier constructed from the loaded or trained model
    """
    cache_file = sogou_corpus_path + '.ser'
    if not os.path.isfile(cache_file):
        # No cached model yet: train once and persist the result for later runs.
        trainer = NaiveBayesClassifier()
        trainer.train(sogou_corpus_path)
        trained_model = trainer.getModel()
        IOUtil.saveObjectTo(trained_model, cache_file)
    else:
        trained_model = IOUtil.readObjectFrom(cache_file)
    return NaiveBayesClassifier(trained_model)
def predict(classifier, text):
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-05-23 17:26
import os
from pyhanlp import SafeJClass
from tests.test_utility import ensure_data
# Bind the Java classes used below by their fully qualified names via pyhanlp's SafeJClass.
NaiveBayesClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# Location of the Sogou mini text-classification corpus, as returned by ensure_data for
# the given dataset name and zip URL.
# NOTE(review): presumably ensure_data downloads/unpacks the archive when it is missing and
# returns a local path -- confirm in tests.test_utility.
sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版',
'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')
def train_or_load_classifier():
    """
    Build a NaiveBayesClassifier backed by a model cached next to the corpus.

    The cache file is ``sogou_corpus_path + '.ser'``. When that file exists the
    model is read from it; otherwise a classifier is trained on the corpus and
    its model is written to the cache file first.

    :return: a NaiveBayesClassifier constructed from the loaded or trained model
    """
    cache_file = sogou_corpus_path + '.ser'
    if not os.path.isfile(cache_file):
        # No cached model yet: train once and persist the result for later runs.
        trainer = NaiveBayesClassifier()
        trainer.train(sogou_corpus_path)
        trained_model = trainer.getModel()
        IOUtil.saveObjectTo(trained_model, cache_file)
    else:
        trained_model = IOUtil.readObjectFrom(cache_file)
    return NaiveBayesClassifier(trained_model)
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# reload(sys) is needed first because Python 2's site module removes
# sys.setdefaultencoding at startup.
# NOTE(review): the `import sys` this relies on is not visible in this excerpt.
if sys.version_info[0] < 3:
reload(sys)
sys.setdefaultencoding("utf-8")
# raise "Must be using Python 3"
from absl import flags # absl-py
from absl import logging # absl-py
# Module-level alias for absl's flags.FLAGS object.
FLAGS = flags.FLAGS
import unittest
import threading
import time
from pyhanlp import HanLP, SafeJClass
# Import the class name in a thread-safe way with SafeJClass, outside the thread body
CRFLexicalAnalyzer = SafeJClass("com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer")
class MyThread(threading.Thread):
    """
    Thread that repeatedly analyzes a fixed sentence with the given analyzer.

    :param name: label printed in this thread's output (stored as ``thread_name``)
    :param counter: number of analyze iterations to run; zero, ``None`` or a
        negative value means no iterations
    :param analyzer: object providing ``analyze(text)``
    """

    def __init__(self, name, counter, analyzer):
        threading.Thread.__init__(self)
        self.thread_name = name
        self.counter = counter
        self.analyzer = analyzer

    def run(self):
        """
        Print a start line, then once per second analyze the sentence and print
        the thread name, the current time and the result, until the counter
        is used up.
        """
        print("Starting " + self.thread_name)
        # Loop only while the counter is positive. A plain truthiness test never
        # terminates for a negative counter, because decrementing moves it away
        # from zero. The leading truthiness check keeps None/0 meaning "no work".
        while self.counter and self.counter > 0:
            time.sleep(1)
            sentence = self.analyzer.analyze("商品和服务")
            print("%s: %s, seg: %s" % (self.thread_name, time.ctime(time.time()), sentence))
            self.counter -= 1
def test_data_path():
    """
    Return the test data directory, ``<HANLP_DATA_PATH>/test``, creating it when
    it does not exist yet.

    :return: path of the test data directory
    :raises OSError: if the directory is missing and cannot be created
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        try:
            # makedirs also creates HANLP_DATA_PATH itself when it is missing,
            # a case in which a plain os.mkdir would fail.
            os.makedirs(data_path)
        except OSError:
            # Something else may have created the directory between the check
            # and the call; only propagate genuine failures.
            if not os.path.isdir(data_path):
                raise
    return data_path
ratio = progress_size / total_size
ratio = max(1e-8, ratio)
percent = ratio * 100
eta = duration / ratio * (1 - ratio)
minutes = eta / 60
seconds = eta % 60
sys.stdout.write("\r%.2f%%, %d MB, %d KB/s, 还有 %d 分 %2d 秒 " %
(percent, progress_size / (1024 * 1024), speed, minutes, seconds))
sys.stdout.flush()
import socket
socket.setdefaulttimeout(10)
urllib.urlretrieve(url, tmp_path, reporthook)
print()
except BaseException as e:
eprint('下载失败 {} 由于 {}'.format(url, repr(e)))
doc_url = 'https://github.com/hankcs/pyhanlp'
eprint('请参考 %s 执行手动安装.' % doc_url)
eprint('或手动下载 {} 到 {}'.format(url, path))
if os.path.isfile(tmp_path):
os.remove(tmp_path)
browser_open(doc_url)
exit(1)
remove_file(path)
os.rename(tmp_path, path)
return True
STATIC_ROOT)
else:
HANLP_JAR_VERSION = os.path.basename(HANLP_JAR_PATH)[len('hanlp-'):-len('.jar')]
# In verbose mode, report the jar, the config file and the data directory being loaded.
if HANLP_VERBOSE:
print("加载 HanLP jar [%s] ..." % HANLP_JAR_PATH)
print("加载 HanLP config [%s/hanlp.properties] ..." % (STATIC_ROOT))
print("加载 HanLP data [%s/data] ..." % (STATIC_ROOT))
# JDK 8 download page; printed and opened in the browser when the JVM lookup below fails.
java_url = 'https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html'
# Classpath separator; starts as the platform default.
pathsep = os.pathsep
jvm_path = None
# Locate the default JVM. On either failure: print a hint, open the download page, exit with status 1.
try:
jvm_path = getDefaultJVMPath()
except JVMNotFoundException as e:
# No Java found: the message asks the user to install JDK 8.
eprint('找不到Java,请安装JDK8:%s' % java_url)
browser_open(java_url)
exit(1)
except JVMNotSupportedException as e:
# The message says the Java and Python bitness differ and that Java, Python and JPype1
# must all be 32-bit or all be 64-bit.
eprint('Java位数与Python不一致,请重新安装一致的Java、Python、JPype1(必须都为32位或64位)')
browser_open(java_url)
exit(1)
# Cygwin handling: when the JVM path is not under /cygdrive, rewrite paths for that JVM.
# NOTE(review): this excerpt is cut off after the first line of the /cygdrive branch.
if platform.system().startswith('CYGWIN'):
if not jvm_path.startswith('/cygdrive'): # Cygwin is using the host machine's JVM, so paths must be translated to real paths
# Use ';' as the classpath separator instead of os.pathsep.
pathsep = ';'
if STATIC_ROOT.startswith('/usr/lib'):
# Windows path of the Cygwin root as printed by `cygpath -w /`, with backslashes turned into '/'.
cygwin_root = os.popen('cygpath -w /').read().strip().replace('\\', '/')
# Replace the leading '/usr' of each path with the Windows-side Cygwin root.
# NOTE(review): presumably relies on Cygwin mapping /usr/lib onto <root>/lib -- confirm.
STATIC_ROOT = cygwin_root + STATIC_ROOT[len('/usr'):]
HANLP_JAR_PATH = cygwin_root + HANLP_JAR_PATH[len('/usr'):]
PATH_CONFIG = cygwin_root + PATH_CONFIG[len('/usr'):]
elif STATIC_ROOT.startswith('/cygdrive'):
# Path components of STATIC_ROOT; for '/cygdrive/<x>/...' index 2 is the drive letter.
driver = STATIC_ROOT.split('/')
# Start the JVM, passing the classpath value plus -Xms/-Xmx options built from the
# configured HANLP_JVM_XMS / HANLP_JVM_XMX values.
# NOTE(review): presumably jpype.startJVM; convertStrings=True should make Java strings
# come back as Python str -- confirm against the JPype version in use.
startJVM(
jvm_path,
JAVA_JAR_CLASSPATH,
"-Xms%s" %
HANLP_JVM_XMS,
"-Xmx%s" %
HANLP_JVM_XMX, convertStrings=True)
# Make sure startup succeeded by loading the main HanLP class.
try:
JClass('com.hankcs.hanlp.HanLP')
except java.lang.NoClassDefFoundError as e:
# The class could not be loaded: tell the user the jar is corrupted, download it
# again, ask for a restart, and exit with status 1.
from pyhanlp.static import install_hanlp_jar
eprint('你的 {} 破损了,现在重新下载'.format(HANLP_JAR_PATH))
install_hanlp_jar()
eprint('下载成功,请重新启动程序')
exit(1)
percent = ratio * 100
eta = duration / ratio * (1 - ratio)
minutes = eta / 60
seconds = eta % 60
sys.stdout.write("\r%.2f%%, %d MB, %d KB/s, 还有 %d 分 %2d 秒 " %
(percent, progress_size / (1024 * 1024), speed, minutes, seconds))
sys.stdout.flush()
import socket
socket.setdefaulttimeout(10)
urllib.urlretrieve(url, tmp_path, reporthook)
print()
except BaseException as e:
eprint('下载失败 {} 由于 {}'.format(url, repr(e)))
doc_url = 'https://github.com/hankcs/pyhanlp'
eprint('请参考 %s 执行手动安装.' % doc_url)
eprint('或手动下载 {} 到 {}'.format(url, path))
if os.path.isfile(tmp_path):
os.remove(tmp_path)
browser_open(doc_url)
exit(1)
remove_file(path)
os.rename(tmp_path, path)
return True
# In verbose mode, report the jar, the config file and the data directory being loaded.
if HANLP_VERBOSE:
print("加载 HanLP jar [%s] ..." % HANLP_JAR_PATH)
print("加载 HanLP config [%s/hanlp.properties] ..." % (STATIC_ROOT))
print("加载 HanLP data [%s/data] ..." % (STATIC_ROOT))
# JDK 8 download page; printed and opened in the browser when the JVM lookup below fails.
java_url = 'https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html'
# Classpath separator; starts as the platform default.
pathsep = os.pathsep
jvm_path = None
# Locate the default JVM. On either failure: print a hint, open the download page, exit with status 1.
try:
jvm_path = getDefaultJVMPath()
except JVMNotFoundException as e:
# No Java found: the message asks the user to install JDK 8.
eprint('找不到Java,请安装JDK8:%s' % java_url)
browser_open(java_url)
exit(1)
except JVMNotSupportedException as e:
# The message says the Java and Python bitness differ and that Java, Python and JPype1
# must all be 32-bit or all be 64-bit.
eprint('Java位数与Python不一致,请重新安装一致的Java、Python、JPype1(必须都为32位或64位)')
browser_open(java_url)
exit(1)
# Cygwin handling: when the JVM path is not under /cygdrive, rewrite paths for that JVM.
if platform.system().startswith('CYGWIN'):
if not jvm_path.startswith('/cygdrive'): # Cygwin is using the host machine's JVM, so paths must be translated to real paths
# Use ';' as the classpath separator instead of os.pathsep.
pathsep = ';'
if STATIC_ROOT.startswith('/usr/lib'):
# Windows path of the Cygwin root as printed by `cygpath -w /`, with backslashes turned into '/'.
cygwin_root = os.popen('cygpath -w /').read().strip().replace('\\', '/')
# Replace the leading '/usr' of each path with the Windows-side Cygwin root.
# NOTE(review): presumably relies on Cygwin mapping /usr/lib onto <root>/lib -- confirm.
STATIC_ROOT = cygwin_root + STATIC_ROOT[len('/usr'):]
HANLP_JAR_PATH = cygwin_root + HANLP_JAR_PATH[len('/usr'):]
PATH_CONFIG = cygwin_root + PATH_CONFIG[len('/usr'):]
elif STATIC_ROOT.startswith('/cygdrive'):
# Turn the '/cygdrive/<x>' prefix into '<X>:' (e.g. '/cygdrive/c' -> 'C:').
driver = STATIC_ROOT.split('/')
cygwin_driver = '/'.join(driver[:3])
win_driver = driver[2].upper() + ':'
# NOTE(review): unlike the /usr/lib branch, PATH_CONFIG is not rewritten here -- confirm this is intended.
HANLP_JAR_PATH = HANLP_JAR_PATH.replace(cygwin_driver, win_driver)
STATIC_ROOT = STATIC_ROOT.replace(cygwin_driver, win_driver)
# Append STATIC_ROOT/<jar> to the classpath using the separator chosen above.
# NOTE(review): `jar` is not defined in this excerpt; presumably this line sits inside a
# loop over extra jar files whose header was cut off -- confirm against the full file.
JAVA_JAR_CLASSPATH = JAVA_JAR_CLASSPATH + pathsep + os.path.join(STATIC_ROOT, jar)
# In verbose mode, print the classpath value.
if HANLP_VERBOSE: print("设置 JAVA_JAR_CLASSPATH [%s]" % JAVA_JAR_CLASSPATH)
# Start the JVM, passing the classpath value plus -Xms/-Xmx options built from the
# configured HANLP_JVM_XMS / HANLP_JVM_XMX values.
# NOTE(review): presumably jpype.startJVM; convertStrings=True should make Java strings
# come back as Python str -- confirm against the JPype version in use.
startJVM(
jvm_path,
JAVA_JAR_CLASSPATH,
"-Xms%s" %
HANLP_JVM_XMS,
"-Xmx%s" %
HANLP_JVM_XMX, convertStrings=True)
# Make sure startup succeeded by loading the main HanLP class.
try:
JClass('com.hankcs.hanlp.HanLP')
except java.lang.NoClassDefFoundError as e:
# The class could not be loaded: tell the user the jar is corrupted, download it
# again, ask for a restart, and exit with status 1.
from pyhanlp.static import install_hanlp_jar
eprint('你的 {} 破损了,现在重新下载'.format(HANLP_JAR_PATH))
install_hanlp_jar()
eprint('下载成功,请重新启动程序')
exit(1)