Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_juman_wrapper(self):
    """Smoke-test the pyknp ``Juman`` wrapper on one short Japanese sentence.

    The sentence is analysed, every element of ``mrph_list()`` is asserted
    to be a ``pyknp.Morpheme``, and each morpheme's fields are written to
    the debug log. When pyknp cannot be imported the test only logs a skip
    message and returns.
    """
    try:
        # ``pyknp`` itself is imported here (not only ``Juman``) so that the
        # ``pyknp.Morpheme`` reference below always resolves.
        import pyknp
        from pyknp import Juman
    except ImportError:
        logger.debug('skip test_juman_wrapper')
        return

    # The analysis runs outside the ``try`` so that an ImportError raised
    # while analysing is not mistaken for "pyknp is not installed".
    juman = Juman()
    result = juman.analysis(u"これはペンです。")
    logger.debug(','.join(mrph.midasi for mrph in result))
    for mrph in result.mrph_list():
        assert isinstance(mrph, pyknp.Morpheme)
        # Lazy %-style arguments: the message is only built when debug
        # logging is enabled; the rendered text is unchanged.
        logger.debug(
            u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s",
            mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
            mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname)
#-*- encoding: utf-8 -*-
from __future__ import absolute_import
from pyknp import Juman
import unittest
# Single analyzer instance, created when the module is imported and reused
# by every call to juman().
JUMAN = Juman()


def juman(input_str):
    """Analyse ``input_str`` with the shared ``JUMAN`` instance and return the result."""
    analysed = JUMAN.analysis(input_str)
    return analysed
class SimpleTest(unittest.TestCase):
    """Checks ``juman()`` against one fixed Japanese sentence."""

    def test(self):
        """Result has 7 items, its surface forms rebuild the input, spec() has >= 7 lines."""
        sentence = u"この文を解析してください。"
        analysed = juman(sentence)

        # Number of items in the analysis result.
        item_count = len(analysed)
        self.assertEqual(item_count, 7)

        # Concatenating every surface form must give back the input text.
        rebuilt = ''.join(morpheme.midasi for morpheme in analysed)
        self.assertEqual(rebuilt, sentence)

        # The textual spec must span at least as many lines as there are items.
        spec_lines = analysed.spec().split("\n")
        self.assertGreaterEqual(len(spec_lines), 7)
if __name__ == '__main__':
def call_juman_interface(self, input_str):
    # type: (text_type) -> MList
    """Analyse ``input_str`` with the backend stored in ``self.jumanpp_obj``.

    * What you can do
        - You call Juman tokenizer interface.
    * Output
        - pyknp.MList

    A ``Juman`` instance is called through ``analysis()``; a
    ``JumanppHnadler`` instance is called through ``query()``, and the
    handler process is restarted and the query retried once when
    ``ProcessDownException`` or ``UnicodeDecodeError`` is raised.
    """
    if isinstance(self.jumanpp_obj, Juman):
        # pyknp backend: analysis() result is used as-is.
        ml_token_object = self.jumanpp_obj.analysis(input_str=input_str)
    elif isinstance(self.jumanpp_obj, JumanppHnadler):
        try:
            result_token = self.jumanpp_obj.query(input_string=input_str)
        except ProcessDownException:
            """Unix process is down by any reason."""
            logger.warning("Re-starting unix process because it takes longer time than {} seconds...".format(self.jumanpp_obj.timeout_second))
            self.jumanpp_obj.restart_process()
            # A query with self.dummy_text is sent first and its result is
            # discarded -- presumably a warm-up of the new process; confirm.
            self.jumanpp_obj.query(self.dummy_text)
            result_token = self.jumanpp_obj.query(input_string=input_str)
            ml_token_object = MList(result_token)
        except UnicodeDecodeError:
            logger.warning(msg="Process is down by some reason. It restarts process automatically.")
            # Same recovery sequence as above: restart, dummy query, retry.
            self.jumanpp_obj.restart_process()
            self.jumanpp_obj.query(self.dummy_text)
            result_token = self.jumanpp_obj.query(input_string=input_str)
            # NOTE(review): no further statements are visible after this
            # line -- the MList wrapping for this path and the return
            # statement are not shown here; confirm against the full file.
def text2token(text, tokenizer, lang='en'):
    """Convert a sentence with a ``_`` placeholder into BERT-style id arrays.

    Args:
        text: sentence in which ``_`` marks the position to mask.
        tokenizer: object providing ``convert_tokens_to_ids()`` and, for
            ``lang='en'``, ``tokenize()``.
        lang: ``'en'`` tokenizes with ``tokenizer``; ``'jp'`` splits the
            text into morphemes with ``Juman`` first.

    Raises:
        ValueError: from ``list.index`` when no ``[MASK]`` token is present.

    NOTE(review): for any other ``lang`` neither branch runs, so
    ``tokenized_text`` is never bound and the lookup below fails. No return
    statement is visible in this excerpt; confirm the return value against
    the full file.
    """
    # convert a text to tokens which can be interpreted in BERT model
    if lang == 'en':
        # Every '_' character becomes the mask token before tokenization.
        text = text.replace('_', '[MASK]')
        masked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(masked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    elif lang == 'jp':
        # A new Juman instance is created on every call.
        jumanapp = Juman()
        juman_res = jumanapp.analysis(text)
        # One token per morpheme surface form, wrapped in [CLS] ... [SEP].
        tokenized_text = [mrph.midasi for mrph in juman_res.mrph_list()]
        tokenized_text.insert(0, '[CLS]')
        tokenized_text.append('[SEP]')
        # Only tokens that are exactly '_' are turned into the mask token.
        tokenized_text = [
            '[MASK]' if token == '_' else token for token in tokenized_text
        ]
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Position of the first [MASK] token.
    masked_index = tokenized_text.index('[MASK]')
    # Every token is assigned segment id 0.
    segments_ids = [0] * len(tokenized_text)
    # Each list is wrapped in an outer list, adding a leading axis of size 1.
    tokens_ts = np.array([indexed_tokens])
    segments_ts = np.array([segments_ids])
    # input length fixed by max_seq_len
    # (ailia should manage adoptable input size)
def call_juman_interface(self, input_str):
    # type: (text_type)->MList
    """Run ``input_str`` through the analyzer held in ``self.juman``.

    A ``pyknp.Juman`` backend returns the result of ``analysis()`` directly.
    A ``JumanppHnadler`` backend is queried and its output wrapped in
    ``MList``; on ``UnicodeDecodeError`` the process is restarted and the
    query is retried once. Any other backend type raises ``Exception``.
    """
    backend = self.juman

    # pyknp backend: nothing to wrap.
    if isinstance(backend, pyknp.Juman):
        return backend.analysis(input_str)

    # Anything that is neither backend type is unsupported.
    if not isinstance(backend, JumanppHnadler):
        raise Exception('Not defined.')

    try:
        raw_output = backend.query(input_str)
    except UnicodeDecodeError:
        logger.warning(msg="Process is down by some reason. It restarts process automatically.")
        backend.restart_process()
        raw_output = backend.query(input_string=input_str)
    return MList(raw_output)
pass
self.is_use_pyknp = True
else:
pass
if server is not None:
# use server mode #
self.juman = pyknp.Juman(command=command, server=server, port=port,
timeout=self.timeout, rcfile=rcfile, option=option,
pattern=pattern, jumanpp=False, **args)
if six.PY3:
# It overwrites juman_lines() method #
self.juman.juman_lines = self.__monkey_patch_juman_lines
elif is_use_pyknp and server is None:
# use unix process with pyknp
self.juman = pyknp.Juman(command=command, server=server, port=port,
timeout=self.timeout, rcfile=rcfile, option=option,
pattern=pattern, jumanpp=False, **args)
else:
# use unix process with pexpect(RECOMMENDED) #
self.juman = JumanppHnadler(jumanpp_command=command,
option=self.option,
pattern=self.pattern,
timeout_second=self.timeout)