How to use the sudachipy.config module in SudachiPy

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github megagonlabs / ginza / sudachipy / dictionary.py View on Github external
self.grammar = None
        self.lexicon = None
        self.input_text_plugins = []
        self.oov_provider_plugins = []
        self.path_rewrite_plugins = []
        self.buffers = []
        self.header = None

        self.read_system_dictionary(os.path.join(config.RESOURCEDIR, settings["systemDict"]))
        """
        for p in settings["editConnectionPlugin"]:
            p.set_up(self.grammar)
            p.edit(self.grammar)
        """

        self.read_character_definition(os.path.join(config.RESOURCEDIR, settings["characterDefinitionFile"]))

        default_input_text_plugin = plugin.input_text.default_input_text_plugin.DefaultInputTextPlugin()
        self.input_text_plugins = [default_input_text_plugin]
        for p in self.input_text_plugins:
            p.set_up()

        simple_oov_plugin = plugin.oov.simple_oov_plugin.SimpleOovPlugin()
        mecab_oov_plugin = plugin.oov.mecab_oov_plugin.MeCabOovPlugin()
        self.oov_provider_plugins = [mecab_oov_plugin, simple_oov_plugin]
        if not self.oov_provider_plugins:
            raise AttributeError("no OOV provider")
        for p in self.oov_provider_plugins:
            p.set_up(self.grammar)

        join_numeric_plugin = plugin.path_rewrite.join_numeric_plugin.JoinNumericPlugin()
        join_katakana_oov_plugin = plugin.path_rewrite.join_katakana_oov_plugin.JoinKatakanaOovPlugin()
github megagonlabs / ginza / sudachipy / dictionary.py View on Github external
def __init__(self, settings):
        self.grammar = None
        self.lexicon = None
        self.input_text_plugins = []
        self.oov_provider_plugins = []
        self.path_rewrite_plugins = []
        self.buffers = []
        self.header = None

        self.read_system_dictionary(os.path.join(config.RESOURCEDIR, settings["systemDict"]))
        """
        for p in settings["editConnectionPlugin"]:
            p.set_up(self.grammar)
            p.edit(self.grammar)
        """

        self.read_character_definition(os.path.join(config.RESOURCEDIR, settings["characterDefinitionFile"]))

        default_input_text_plugin = plugin.input_text.default_input_text_plugin.DefaultInputTextPlugin()
        self.input_text_plugins = [default_input_text_plugin]
        for p in self.input_text_plugins:
            p.set_up()

        simple_oov_plugin = plugin.oov.simple_oov_plugin.SimpleOovPlugin()
        mecab_oov_plugin = plugin.oov.mecab_oov_plugin.MeCabOovPlugin()
        self.oov_provider_plugins = [mecab_oov_plugin, simple_oov_plugin]
github WorksApplications / SudachiPy / sudachipy / dictionary.py View on Github external
def __init__(self, config_path=None, resource_dir=None):
        config.settings.set_up(config_path, resource_dir)
        self.grammar = None
        self.lexicon = None
        self.input_text_plugins = []
        self.edit_connection_plugin = []
        self.oov_provider_plugins = []
        self.path_rewrite_plugins = []
        self.dictionaries = []
        self.header = None
        self._read_system_dictionary(config.settings.system_dict_path())

        # self.edit_connection_plugin = [InhibitConnectionPlugin()]
        # for p in self.edit_connection_plugin:
        #     p.set_up(self.grammar)
        #     p.edit(self.grammar)

        self._read_character_definition(config.settings.char_def_path())

        self.input_text_plugins = get_input_text_plugins()
        for p in self.input_text_plugins:
            p.set_up()

        self.oov_provider_plugins = get_oov_plugins()
        if not self.oov_provider_plugins:
            raise AttributeError("no OOV provider")
        for p in self.oov_provider_plugins:
github megagonlabs / ginza / sudachipy / command_line.py View on Github external
def main():
    parser = argparse.ArgumentParser(description="Japanese Morphological Analyzer")
    subparsers = parser.add_subparsers()

    # root parser
    parser.add_argument("-v", "--version", action="version", version="%(prog)s v0.1.1")

    # tokenize parser
    parser_tk = subparsers.add_parser('tokenize', help='see `tokenize -h`', description='Japanese Morphological Analyze')
    parser_tk.add_argument("-r", dest="fpath_setting", metavar="file",
                           default=config.SETTINGFILE, help="the setting file in JSON format")
    parser_tk.add_argument("-m", dest="mode", choices=["A", "B", "C"], default="C", help="the mode of splitting")
    parser_tk.add_argument("-o", dest="fpath_out", metavar="file", help="the output file")
    parser_tk.add_argument("-a", action="store_true", help="print all of the fields")
    parser_tk.add_argument("-d", action="store_true", help="print the debug information")
    parser_tk.add_argument("input_files", metavar="input file(s)", nargs=argparse.REMAINDER)
    parser_tk.set_defaults(handler=_command_tokenize, print_usage=parser_tk.print_usage)

    # build dictionary parser
    parser_bd = subparsers.add_parser('build', help='see `build -h`', description='Build Sudachi Dictionary')
    parser_bd.add_argument('-o', dest='out_file', metavar='file', default='system.dic',
                           help='output file (default: system.dic)')
    parser_bd.add_argument('-d', dest='description', default='', metavar='string', required=False,
                           help='description comment to be embedded on dictionary')
    required_named_bd = parser_bd.add_argument_group('required named arguments')
    required_named_bd.add_argument('-m', dest='matrix_file', metavar='file', required=True,
                                   help='connection matrix file with MeCab\'s matrix.def format')
github megagonlabs / ginza / sudachipy / plugin / oov / mecab_oov_plugin.py View on Github external
def set_up(self, grammar):
        """Load the character-property and OOV definitions for this plugin.

        Both files are looked up under ``config.RESOURCEDIR``: ``char.def`` is
        passed to ``read_character_property`` and ``unk.def`` (together with
        ``grammar``) to ``read_oov``, in that order.

        Raises:
            AttributeError: if either joined path evaluates as falsy.
        """
        # Character definitions are read first, then the OOV entries.
        char_def_path = os.path.join(config.RESOURCEDIR, "char.def")
        if not char_def_path:
            raise AttributeError("charDef is not defined")
        self.read_character_property(char_def_path)

        unk_def_path = os.path.join(config.RESOURCEDIR, "unk.def")
        if not unk_def_path:
            raise AttributeError("unkDef is not defined")
        self.read_oov(unk_def_path, grammar)
github WorksApplications / SudachiPy / sudachipy / plugin / input_text / utils.py View on Github external
def get_input_text_plugins() -> List[InputTextPlugin]:
    """Build the plugins listed under ``'inputTextPlugin'`` in ``config.settings``.

    Returns:
        One result of ``get_input_text_plugin`` per settings entry, in the
        order the entries are iterated; an empty list when the key is absent.
    """
    key_word = 'inputTextPlugin'
    if key_word in config.settings:
        return [get_input_text_plugin(entry) for entry in config.settings[key_word]]
    return []
github WorksApplications / SudachiPy / sudachipy / plugin / connect_cost / inhibitconnectioncost.py View on Github external
def set_up(self, grammar: Grammar) -> None:
        """Copy the ``'inhibitedPair'`` setting onto this plugin, if present.

        ``grammar`` is accepted but not used here. When the key is missing
        from ``config.settings``, ``self._inhibit_pairs`` is left untouched.
        """
        if 'inhibitedPair' not in config.settings:
            return
        self._inhibit_pairs = config.settings['inhibitedPair']
github WorksApplications / SudachiPy / sudachipy / plugin / connect_cost / inhibitconnectioncost.py View on Github external
def set_up(self, grammar: Grammar) -> None:
        """Read the inhibited pairs from ``config.settings`` when configured.

        The ``grammar`` argument is not used in this method. If the
        ``'inhibitedPair'`` key is absent, nothing is assigned.
        """
        if 'inhibitedPair' not in config.settings:
            return
        self._inhibit_pairs = config.settings['inhibitedPair']
github WorksApplications / SudachiPy / sudachipy / plugin / oov / mecab_oov_plugin.py View on Github external
def set_up(self, grammar):
        """Load the character-property and OOV definitions for this plugin.

        File names come from the private ``__chardef_filename`` and
        ``__unkdef_filename`` attributes and are joined onto
        ``config.settings.resource_dir``. The character definitions are read
        first, then the OOV entries (which also receive ``grammar``).

        Raises:
            AttributeError: if either joined path evaluates as falsy.
        """
        char_def_path = os.path.join(config.settings.resource_dir, self.__chardef_filename)
        if not char_def_path:
            raise AttributeError("charDef is not defined")
        self.read_character_property(char_def_path)

        unk_def_path = os.path.join(config.settings.resource_dir, self.__unkdef_filename)
        if not unk_def_path:
            raise AttributeError("unkDef is not defined")
        self.read_oov(unk_def_path, grammar)
github WorksApplications / SudachiPy / sudachipy / plugin / path_rewrite / utils.py View on Github external
def get_path_rewrite_plugins() -> List[PathRewritePlugin]:
    """Build the plugins listed under ``'pathRewritePlugin'`` in ``config.settings``.

    Returns:
        One result of ``get_path_rewrite_plugin`` per settings entry, in the
        order the entries are iterated; an empty list when the key is absent.
    """
    if 'pathRewritePlugin' in config.settings:
        return [get_path_rewrite_plugin(entry) for entry in config.settings['pathRewritePlugin']]
    return []