def setUp(self):
    # Build a test Dictionary from the bundled resources and wire up
    # the numeric-joining path rewrite plugin.
    resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
    self.dict_ = Dictionary(os.path.join(resource_dir, 'numeric_sudachi.json'), resource_dir)
    self.tokenizer = self.dict_.create()
    self.plugin = JoinNumericPlugin(None)
    self.plugin.set_up(self.dict_.grammar)
def setUp(self):
    resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources')
    self.dict_ = Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir=resource_dir)
def setUp(self):
    resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
    self.dict_ = Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir)
    self.tokenizer = self.dict_.create()
    self.plugin = JoinKatakanaOovPlugin(settings['pathRewritePlugin'][1])
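# For reference, the `settings` dict indexed above mirrors the
# "pathRewritePlugin" section of sudachi.json; index [1] selects the
# JoinKatakanaOovPlugin entry. This sketch is an assumption based on the
# default Sudachi configuration, not the exact test resource file.
settings = {
    'pathRewritePlugin': [
        {'class': 'sudachipy.plugin.path_rewrite.JoinNumericPlugin',
         'joinKanjiNumeric': True},
        {'class': 'sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin',
         'oovPOS': ['名詞', '普通名詞', '一般', '*', '*', '*'],
         'minLength': 3},
    ],
}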
def setUp(self):
    resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resources')
    self.dict_ = dictionary.Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir)
    self.tokenizer_obj = self.dict_.create()
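# A minimal sketch of a test that could follow the setUp above. It assumes
# `from sudachipy import tokenizer` at module level and that the test
# dictionary covers the sample text; tokenize() and SplitMode are public
# SudachiPy API (text-first argument order, as in recent releases).
def test_tokenize_surfaces(self):
    morphemes = self.tokenizer_obj.tokenize('東京都へ行く', tokenizer.Tokenizer.SplitMode.C)
    self.assertEqual(['東京都', 'へ', '行く'], [m.surface() for m in morphemes])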
# Route logger output either to stdout or to the file given on the command line.
stdout_logger = logging.getLogger(__name__)
output = sys.stdout
if args.fpath_out:
    output = open(args.fpath_out, "w", encoding="utf-8")
handler = logging.StreamHandler(output)
handler.setLevel(logging.DEBUG)
stdout_logger.addHandler(handler)
stdout_logger.setLevel(logging.DEBUG)
stdout_logger.propagate = False

print_all = args.a
enable_dump = args.d

try:
    dict_ = dictionary.Dictionary(config_path=args.fpath_setting)
    tokenizer_obj = dict_.create()
    input_ = fileinput.input(args.in_files, openhook=fileinput.hook_encoded("utf-8"))
    run(tokenizer_obj, mode, input_, print_all, stdout_logger, enable_dump)
finally:
    if args.fpath_out:
        output.close()
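# run() is called above but not defined on this page. A plausible sketch,
# assuming it logs one morpheme per line (surface, POS, normalized form)
# followed by an EOS marker, in the style of the sudachipy CLI; the exact
# fields and the handling of enable_dump are assumptions.
def run(tokenizer_obj, mode, input_, print_all, logger, enable_dump):
    for line in input_:
        for m in tokenizer_obj.tokenize(line.rstrip('\n'), mode):
            fields = [m.surface(), ','.join(m.part_of_speech()), m.normalized_form()]
            if print_all:
                fields += [m.dictionary_form(), m.reading_form()]
            logger.info('\t'.join(fields))
        logger.info('EOS')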
if args.mode == "A":
mode = tokenizer.Tokenizer.SplitMode.A
elif args.mode == "B":
mode = tokenizer.Tokenizer.SplitMode.B
else:
mode = tokenizer.Tokenizer.SplitMode.C
output = sys.stdout
if args.fpath_out:
output = open(args.fpath_out, "w", encoding="utf-8")
print_all = args.a
is_enable_dump = args.d
dict_ = dictionary.Dictionary(settings)
tokenizer_obj = dict_.create()
if is_enable_dump:
tokenizer_obj.set_dump_output(output)
input_ = fileinput.input(args.input_files, openhook=fileinput.hook_encoded("utf-8"))
run(tokenizer_obj, mode, input_, output, print_all)
output.close()
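# Neither snippet shows where `args` comes from. A minimal argparse setup
# that would supply the attributes used above; the flag names are
# assumptions inferred from the attribute names, and the two snippets
# disagree on in_files vs. input_files (this sketch matches the first).
import argparse

parser = argparse.ArgumentParser(description="tokenize input with SudachiPy")
parser.add_argument("-r", dest="fpath_setting", metavar="FILE", help="path to sudachi.json")
parser.add_argument("-m", dest="mode", choices=["A", "B", "C"], default="C", help="split mode")
parser.add_argument("-o", dest="fpath_out", metavar="FILE", help="output file (default: stdout)")
parser.add_argument("-a", action="store_true", help="print all morpheme fields")
parser.add_argument("-d", action="store_true", help="dump internal lattice for debugging")
parser.add_argument("in_files", nargs="*", help="input files (default: stdin)")
args = parser.parse_args()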
others.
"""
try:
    from sudachipy import tokenizer
    from sudachipy import dictionary
except ImportError:
    msg = "Importing sudachipy failed."
    msg += "\n 1. make sure SudachiPy is successfully installed."
    msg += "\n 2. make sure a dictionary is successfully installed."
    raise ImportError(msg)
super(SudachiTokenizer, self).__init__(
    name="sudachi ({})".format(mode), with_postag=with_postag,
)
try:
    self._tokenizer = dictionary.Dictionary().create()
except KeyError:
    msg = "Loading a dictionary failed."
    msg += " (see https://github.com/WorksApplications/SudachiPy#install-dict-packages)"  # NOQA
    raise KeyError(msg)
_mode = mode.capitalize()
if _mode == "A":
    self._mode = tokenizer.Tokenizer.SplitMode.A
elif _mode == "B":
    self._mode = tokenizer.Tokenizer.SplitMode.B
elif _mode == "C":
    self._mode = tokenizer.Tokenizer.SplitMode.C
else:
    raise ValueError("Invalid mode: mode must be A, B, or C.")  # NOQA
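# A sketch of how the fields set above could be used; the method name and
# return shape are assumptions, since the rest of the class is not shown.
def tokenize(self, text):
    return [m.surface() for m in self._tokenizer.tokenize(text, self._mode)]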
def __init__(self, config):
    """Construct a SudachiPy-based tokenizer.

    Note that this tokenizer uses a regex for sentence segmentation.
    """
    if config['lang'] != 'ja':
        raise Exception("The SudachiPy tokenizer is only allowed in Japanese pipelines.")
    check_sudachipy()
    from sudachipy import tokenizer
    from sudachipy import dictionary
    self.tokenizer = dictionary.Dictionary().create()
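# The docstring mentions regex-based sentence segmentation, which is not
# shown in the snippet. One plausible sketch (the pattern and helper names
# are assumptions): split after Japanese sentence-final punctuation, then
# tokenize each sentence with SudachiPy.
import re

def segment_sentences(text):
    # Split after 。, ！, or ？ while keeping the punctuation attached.
    return [s for s in re.split(r'(?<=[。！？])\s*', text) if s]

def tokenize_document(self, text):
    from sudachipy import tokenizer
    mode = tokenizer.Tokenizer.SplitMode.A
    return [[m.surface() for m in self.tokenizer.tokenize(sent, mode)]
            for sent in segment_sentences(text)]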