How to use the sudachipy.dictionary.Dictionary function in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github WorksApplications / SudachiPy / tests / plugin / test_join_numeric_plugin.py View on Github external
def setUp(self):
        """Build a test Dictionary and Tokenizer, and wire a JoinNumericPlugin to the grammar."""
        # Removed a stray dead `pass` statement that preceded the real body.
        # Test resources live one directory up from this test module.
        resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'resources')
        self.dict_ = Dictionary(os.path.join(resource_dir, 'numeric_sudachi.json'), resource_dir)
        self.tokenizer = self.dict_.create()
        # None -> plugin uses its default settings.
        self.plugin = JoinNumericPlugin(None)
        self.plugin.set_up(self.dict_.grammar)
github WorksApplications / SudachiPy / tests / test_dictionary.py View on Github external
def setUp(self):
        """Create the Dictionary under test from the bundled resource directory."""
        here = os.path.dirname(os.path.abspath(__file__))
        resource_dir = os.path.join(here, 'resources')
        config_path = os.path.join(resource_dir, 'sudachi.json')
        self.dict_ = Dictionary(config_path, resource_dir=resource_dir)
github WorksApplications / SudachiPy / tests / plugin / test_join_katakana_oov_plugin.py View on Github external
def setUp(self):
        """Prepare a tokenizer plus a JoinKatakanaOovPlugin configured from settings."""
        here = os.path.dirname(os.path.abspath(__file__))
        resource_dir = os.path.join(here, os.pardir, 'resources')
        self.dict_ = Dictionary(os.path.join(resource_dir, 'sudachi.json'), resource_dir)
        self.tokenizer = self.dict_.create()
        plugin_config = settings['pathRewritePlugin'][1]
        self.plugin = JoinKatakanaOovPlugin(plugin_config)
github WorksApplications / SudachiPy / tests / test_tokenizer.py View on Github external
def setUp(self):
        """Instantiate the tokenizer used by the tests."""
        base = os.path.dirname(os.path.abspath(__file__))
        resource_dir = os.path.join(base, 'resources')
        config_path = os.path.join(resource_dir, 'sudachi.json')
        self.dict_ = dictionary.Dictionary(config_path, resource_dir)
        self.tokenizer_obj = self.dict_.create()
github WorksApplications / SudachiPy / sudachipy / command_line.py View on Github external
stdout_logger = logging.getLogger(__name__)
    # Default to stdout; switch to the requested file when --fpath_out was given.
    output = sys.stdout
    if args.fpath_out:
        output = open(args.fpath_out, "w", encoding="utf-8")
    # Route this module's log records to the chosen output stream.
    handler = logging.StreamHandler(output)
    handler.setLevel(logging.DEBUG)
    stdout_logger.addHandler(handler)
    stdout_logger.setLevel(logging.DEBUG)
    # Do not re-emit records through ancestor loggers' handlers.
    stdout_logger.propagate = False

    print_all = args.a
    enable_dump = args.d

    try:
        dict_ = dictionary.Dictionary(config_path=args.fpath_setting)
        tokenizer_obj = dict_.create()
        # Read all input files (or stdin) as UTF-8 via fileinput.
        input_ = fileinput.input(args.in_files, openhook=fileinput.hook_encoded("utf-8"))
        run(tokenizer_obj, mode, input_, print_all, stdout_logger, enable_dump)
    finally:
        # Close the output file only if we opened one ourselves (never close stdout).
        if args.fpath_out:
            output.close()
github megagonlabs / ginza / sudachipy / command_line.py View on Github external
if args.mode == "A":
        mode = tokenizer.Tokenizer.SplitMode.A
    elif args.mode == "B":
        mode = tokenizer.Tokenizer.SplitMode.B
    else:
        # Any other value falls back to split mode C.
        mode = tokenizer.Tokenizer.SplitMode.C

    # Default to stdout; switch to the requested file when --fpath_out was given.
    output = sys.stdout
    if args.fpath_out:
        output = open(args.fpath_out, "w", encoding="utf-8")

    print_all = args.a

    is_enable_dump = args.d

    dict_ = dictionary.Dictionary(settings)
    tokenizer_obj = dict_.create()
    if is_enable_dump:
        # Send the tokenizer's dump/debug output to the same stream as results.
        tokenizer_obj.set_dump_output(output)

    # Read all input files (or stdin) as UTF-8 via fileinput.
    input_ = fileinput.input(args.input_files, openhook=fileinput.hook_encoded("utf-8"))
    run(tokenizer_obj, mode, input_, output, print_all)

    # NOTE(review): this also closes sys.stdout when no output file was given — confirm intended.
    output.close()
github himkt / tiny_tokenizer / konoha / word_tokenizers / sudachi_tokenizer.py View on Github external
others.
        """
        # Import lazily so konoha works without SudachiPy installed.
        try:
            from sudachipy import tokenizer
            from sudachipy import dictionary
        except ImportError:
            msg = "Importing sudachipy failed for some reason."
            msg += "\n  1. make sure SudachiPy is successfully installed."
            msg += "\n  2. make sure dictionary is successfully installed."
            raise ImportError(msg)

        super(SudachiTokenizer, self).__init__(
            name="sudachi ({})".format(mode), with_postag=with_postag,
        )
        # Dictionary() raises KeyError when no dictionary package can be located.
        try:
            self._tokenizer = dictionary.Dictionary().create()
        except KeyError:
            msg = "Loading a dictionary fails."
            msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
            raise KeyError(msg)

        # Accept lower-case mode names by normalizing to "A"/"B"/"C".
        _mode = mode.capitalize()
        if _mode == "A":
            self._mode = tokenizer.Tokenizer.SplitMode.A
        elif _mode == "B":
            self._mode = tokenizer.Tokenizer.SplitMode.B
        elif _mode == "C":
            self._mode = tokenizer.Tokenizer.SplitMode.C
        else:
            raise ValueError("Invalid mode is specified. Mode should be A, B, or C.")  # NOQA
github stanfordnlp / stanza / stanza / pipeline / external / sudachipy.py View on Github external
def __init__(self, config):
        """Build a tokenizer backed by SudachiPy.

        Sentence segmentation is handled separately with regex.
        """
        # Guard clause: SudachiPy is only usable for Japanese.
        lang = config['lang']
        if lang != 'ja':
            raise Exception("SudachiPy tokenizer is only allowed in Japanese pipelines.")

        check_sudachipy()
        # Import lazily so the dependency is only required when this tokenizer is used.
        from sudachipy import dictionary
        from sudachipy import tokenizer

        self.tokenizer = dictionary.Dictionary().create()