def test_tokenizer_morpheme_split(self):
    from sudachipy import tokenizer
    # Tokenize in the coarsest mode (C), which keeps '東京都' as a single morpheme.
    ms = self.tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.C)
    self.assertEqual(1, ms.size())
    self.assertEqual(ms[0].surface(), '東京都')
    # Re-split the morpheme with the short-unit mode (A): '東京' + '都'.
    ms_a = ms[0].split(tokenizer.Tokenizer.SplitMode.A)
    self.assertEqual(2, ms_a.size())
    self.assertEqual(ms_a[0].surface(), '東京')
    self.assertEqual(ms_a[1].surface(), '都')
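
For context, here is a minimal, self-contained sketch of the API the test above exercises. It assumes SudachiPy plus a dictionary package such as sudachidict_core are installed, and uses only calls that appear in the snippets on this page:

from sudachipy import dictionary, tokenizer

tokenizer_obj = dictionary.Dictionary().create()
morphemes = tokenizer_obj.tokenize('東京都', tokenizer.Tokenizer.SplitMode.C)
for m in morphemes:
    # Re-split each coarse (C) morpheme into short-unit (A) morphemes.
    for sub in m.split(tokenizer.Tokenizer.SplitMode.A):
        print(sub.surface())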
def split(self, mode, index, wi):
    # Pick the precomputed word-id list for the requested split mode;
    # mode C (or anything else) returns the morpheme unchanged.
    if mode is tokenizer.Tokenizer.SplitMode.A:
        word_ids = wi.a_unit_split
    elif mode is tokenizer.Tokenizer.SplitMode.B:
        word_ids = wi.b_unit_split
    else:
        return [self.__getitem__(index)]
    if len(word_ids) <= 1:
        # Nothing to split into.
        return [self.__getitem__(index)]
    # Build one lattice node per sub-word, laying the nodes out
    # contiguously from the original morpheme's begin offset.
    offset = self.path[index].get_begin()
    nodes = []
    for wid in word_ids:
        n = latticenode.LatticeNode(self.lexicon, 0, 0, 0, wid)
        n.set_begin(offset)
        offset += n.get_word_info().head_word_length
        n.set_end(offset)
        nodes.append(n)
    return nodes
def _command_tokenize(args, print_usage):
    if args.version:
        print_version()
        return
    _input_files_checker(args, print_usage)
    # Map the CLI mode string to a SplitMode; anything else defaults to C.
    if args.mode == "A":
        mode = tokenizer.Tokenizer.SplitMode.A
    elif args.mode == "B":
        mode = tokenizer.Tokenizer.SplitMode.B
    else:
        mode = tokenizer.Tokenizer.SplitMode.C
    # Route output through a dedicated logger so it can target either
    # stdout or the file given on the command line.
    stdout_logger = logging.getLogger(__name__)
    output = sys.stdout
    if args.fpath_out:
        output = open(args.fpath_out, "w", encoding="utf-8")
    handler = logging.StreamHandler(output)
    handler.setLevel(logging.DEBUG)
    stdout_logger.addHandler(handler)
    stdout_logger.setLevel(logging.DEBUG)
    stdout_logger.propagate = False
    print_all = args.a
    enable_dump = args.d
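
The string-to-SplitMode mapping above (and in the variants below) can be collapsed into a table lookup. This small helper is hypothetical, not part of the SudachiPy CLI; it defaults to mode C exactly as the elif chains do:

from sudachipy import tokenizer

_SPLIT_MODES = {
    "A": tokenizer.Tokenizer.SplitMode.A,
    "B": tokenizer.Tokenizer.SplitMode.B,
    "C": tokenizer.Tokenizer.SplitMode.C,
}

def resolve_split_mode(name, default=tokenizer.Tokenizer.SplitMode.C):
    # Unknown or missing names fall back to the coarsest mode,
    # matching the CLI behaviour above.
    return _SPLIT_MODES.get((name or "").upper(), default)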
def create(self):
    # Assemble a Tokenizer from the dictionary's grammar, lexicon, and plugins.
    return tokenizer.Tokenizer(self.grammar, self.lexicon, self.input_text_plugins,
                               self.oov_provider_plugins, self.path_rewrite_plugins)
def _command_tokenize(args, print_usage):
    # Load the dictionary settings from the JSON file given on the command line.
    with open(args.fpath_setting, "r", encoding="utf-8") as f:
        settings = json.load(f)
    if args.mode == "A":
        mode = tokenizer.Tokenizer.SplitMode.A
    elif args.mode == "B":
        mode = tokenizer.Tokenizer.SplitMode.B
    else:
        mode = tokenizer.Tokenizer.SplitMode.C
    output = sys.stdout
    if args.fpath_out:
        output = open(args.fpath_out, "w", encoding="utf-8")
    print_all = args.a
    is_enable_dump = args.d
    dict_ = dictionary.Dictionary(settings)
    tokenizer_obj = dict_.create()
    if is_enable_dump:
        tokenizer_obj.set_dump_output(output)
    # Read all input files as UTF-8 and tokenize them line by line.
    input_ = fileinput.input(args.input_files, openhook=fileinput.hook_encoded("utf-8"))
    run(tokenizer_obj, mode, input_, output, print_all)
def try_import_sudachipy_split_mode():
    try:
        from sudachipy import tokenizer
        return tokenizer.Tokenizer.SplitMode
    except ImportError:
        raise ImportError(
            "Japanese support requires SudachiPy distributed with ja language model"
        )
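
A caller might use the guard like this, assuming try_import_sudachipy_split_mode is in scope; the SplitMode class resolves only when SudachiPy is installed:

SplitMode = try_import_sudachipy_split_mode()
mode = SplitMode.A  # short-unit segmentation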
def read_user_dictionary(self, filename):
    # Memory-map the user dictionary read-only and keep a reference so the
    # buffer stays alive for the lifetime of this dictionary.
    with open(filename, 'r+b') as user_dic:
        bytes_ = mmap.mmap(user_dic.fileno(), 0, prot=mmap.PROT_READ)
    self.buffers.append(bytes_)
    user_lexicon = dictionarylib.doublearraylexicon.DoubleArrayLexicon(bytes_, 0)
    # Estimate connection costs for user words with a plugin-free tokenizer.
    tokenizer_ = tokenizer.JapaneseTokenizer(self.grammar, self.lexicon,
                                             self.input_text_plugins,
                                             self.oov_provider_plugins, [])
    user_lexicon.calculate_cost(tokenizer_)
    self.lexicon.append(user_lexicon)
name="sudachi ({})".format(mode), with_postag=with_postag,
)
try:
self._tokenizer = dictionary.Dictionary().create()
except KeyError:
msg = "Loading a dictionary fails."
msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )" # NOQA
raise KeyError(msg)
_mode = mode.capitalize()
if _mode == "A":
self._mode = tokenizer.Tokenizer.SplitMode.A
elif _mode == "B":
self._mode = tokenizer.Tokenizer.SplitMode.B
elif _mode == "C":
self._mode = tokenizer.Tokenizer.SplitMode.C
else:
raise ValueError("Invalid mode is specified. Mode should be A, B, or C.") # NOQA