How to use SudachiPy - 10 common examples

To help you get started, we’ve selected a few SudachiPy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

GitHub: WorksApplications / SudachiPy / sudachipy / dictionarylib / charactercategory.py (View on GitHub)
n = len(self.range_list)
        end = n
        pivot = (begin + end) // 2
        while 0 <= pivot < n:
            range_ = self.range_list[pivot]
            if range_.contains(code_point):
                return range_.categories
            if range_.lower(code_point):
                begin = pivot
            else:  # range_.higher(code_point)
                end = pivot
            new_pivot = (begin + end) // 2
            if new_pivot == pivot:
                break
            pivot = new_pivot
        return {categorytype.CategoryType.DEFAULT}
GitHub: WorksApplications / SudachiPy / tests / test_tokenizer.py (View on GitHub)
def test_tokenizer_morpheme_split(self):
    # Tokenizing '東京都' in mode C is expected to give a single morpheme;
    # re-splitting that morpheme in mode A is expected to give '東京' + '都'.
    from sudachipy import tokenizer

    split_mode = tokenizer.Tokenizer.SplitMode

    coarse = self.tokenizer_obj.tokenize('東京都', split_mode.C)
    self.assertEqual(1, coarse.size())
    whole = coarse[0]
    self.assertEqual(whole.surface(), '東京都')

    fine = whole.split(split_mode.A)
    self.assertEqual(2, fine.size())
    self.assertEqual(fine[0].surface(), '東京')
    self.assertEqual(fine[1].surface(), '都')
GitHub: WorksApplications / SudachiPy / tests / test_tokenizer.py (View on GitHub)
def test_tokenizer_morpheme_split(self):
    # Mode C should keep '東京都' as one morpheme; splitting that morpheme
    # again in mode A should produce the two units '東京' and '都'.
    from sudachipy import tokenizer

    modes = tokenizer.Tokenizer.SplitMode
    morphemes_c = self.tokenizer_obj.tokenize('東京都', modes.C)
    self.assertEqual(1, morphemes_c.size())
    self.assertEqual(morphemes_c[0].surface(), '東京都')

    morphemes_a = morphemes_c[0].split(modes.A)
    self.assertEqual(2, morphemes_a.size())
    first_unit = morphemes_a[0]
    self.assertEqual(first_unit.surface(), '東京')
    second_unit = morphemes_a[1]
    self.assertEqual(second_unit.surface(), '都')
GitHub: WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py (View on GitHub)
def test_parseline(self):
    # Parses two comma-separated lexicon rows and checks the resulting entry:
    # a regular row keeps its headword and parameters, while a row whose
    # second and third columns are -1 is expected to have no headword.
    dict_builder = DictionaryBuilder(logger=self.logger)

    regular_row = '京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'
    parsed = dict_builder.parse_line(regular_row.split(','))
    self.assertEqual('京都', parsed.headword)
    self.assertEqual([6, 6, 5293], parsed.parameters)
    self.assertEqual(0, parsed.wordinfo.pos_id)
    self.assertEqual('*', parsed.aunit_split_string)
    self.assertEqual('*', parsed.bunit_split_string)

    negative_id_row = '京都,-1,-1,0,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'
    parsed = dict_builder.parse_line(negative_id_row.split(','))
    self.assertIsNone(parsed.headword)
    self.assertEqual(0, parsed.wordinfo.pos_id)
GitHub: WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py (View on GitHub)
def test_parse_line_empty_headword(self):
    # A row whose first column is empty must be rejected with a ValueError
    # carrying the message 'headword is empty'.
    dict_builder = DictionaryBuilder(logger=self.logger)
    row = ',6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*'
    columns = row.split(',')

    with self.assertRaises(ValueError) as raised:
        dict_builder.parse_line(columns)

    message = raised.exception.args[0]
    self.assertEqual('headword is empty', message)
GitHub: WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py (View on GitHub)
def test_parse_line_same_readingform(self):
    # The row uses '〒' in both the first and the twelfth column; the parsed
    # word info is expected to report '〒' as its reading form.
    row = '〒,6,6,5293,〒,名詞,普通名詞,一般,*,*,*,〒,〒,*,A,*,*,*'
    dict_builder = DictionaryBuilder(logger=self.logger)
    parsed = dict_builder.parse_line(row.split(','))
    word_info = parsed.wordinfo
    self.assertEqual('〒', word_info.reading_form)
GitHub: WorksApplications / SudachiPy / tests / test_utf8inputtext.py (View on GitHub)
def test_get_char_category_types(self):
    # Checks, for a series of indices into the built input text, that the
    # expected character category is among those reported for that index.
    #
    # assertIn is used instead of assertTrue(x in y) so that a failure shows
    # the missing category and the categories actually returned, and the
    # index is attached as the message so the failing row is identifiable.
    input_ = self.builder.build()
    category = dictionarylib.categorytype.CategoryType

    # (index passed to get_char_category_types, category expected there)
    expectations = (
        (0, category.ALPHA),
        (2, category.ALPHA),
        (5, category.ALPHA),
        (6, category.NUMERIC),
        (7, category.HIRAGANA),
        (9, category.HIRAGANA),
        (10, category.NUMERIC),
        (13, category.KANJI),
        (18, category.KANJI),
        (19, category.DEFAULT),
        (22, category.DEFAULT),
        (23, category.KATAKANA),
        (26, category.KATAKANA),
        (31, category.KATAKANA),
    )
    for index, expected in expectations:
        self.assertIn(
            expected,
            input_.get_char_category_types(index),
            msg='index {}'.format(index))
GitHub: WorksApplications / SudachiPy / tests / plugin / test_mecab_oov_plugin.py (View on GitHub)
def test_read_oov(self):
    # Writes a two-row OOV definition file for the DEFAULT category, loads it
    # with MeCabOovPlugin.read_oov, and checks that both rows end up under
    # CategoryType.DEFAULT with the first row's ids, cost and POS id.
    oov = os.path.join(self.test_dir, 'test.txt')
    # The rows contain non-ASCII text, so the encoding is pinned rather than
    # left to the platform default (which may be unable to encode it).
    # NOTE(review): assumes read_oov reads the file as UTF-8 - confirm.
    with open(oov, 'w', encoding='utf-8') as wf:
        wf.write("DEFAULT,1,2,3,補助記号,一般,*,*,*,*\n")
        wf.write("DEFAULT,3,4,5,補助記号,一般,*,*,*,*\n")

    plugin = MeCabOovPlugin()
    # The category is registered before the file is loaded.
    plugin.categories[CategoryType.DEFAULT] = MeCabOovPlugin.CategoryInfo()
    plugin.read_oov(oov, mock_grammar.mocked_grammar)

    self.assertEqual(1, len(plugin.oov_list))
    self.assertEqual(2, len(plugin.oov_list[CategoryType.DEFAULT]))
    self.assertEqual(1, plugin.oov_list[CategoryType.DEFAULT][0].left_id)
    self.assertEqual(2, plugin.oov_list[CategoryType.DEFAULT][0].right_id)
    self.assertEqual(3, plugin.oov_list[CategoryType.DEFAULT][0].cost)
    self.assertEqual(0, plugin.oov_list[CategoryType.DEFAULT][0].pos_id)
GitHub: WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py (View on GitHub)
def test_write_intarray(self):
    # Expected layout, as asserted below: an empty array is written as a
    # single 0 byte; [1, 2, 3] is written as a length byte (3) followed by
    # three 4-byte little-endian signed integers.
    dict_builder = DictionaryBuilder(logger=self.logger)
    start = dict_builder.byte_buffer.tell()

    dict_builder.write_intarray([])
    self.assertEqual(0, dict_builder.byte_buffer.getvalue()[start])

    dict_builder.write_intarray([1, 2, 3])
    self.assertEqual(3, dict_builder.byte_buffer.getvalue()[start + 1])
    for slot, expected in enumerate((1, 2, 3)):
        # Values start right after the two length bytes, 4 bytes apiece.
        lo = start + 2 + 4 * slot
        raw = dict_builder.byte_buffer.getvalue()[lo:lo + 4]
        decoded = int.from_bytes(raw, byteorder='little', signed=True)
        self.assertEqual(expected, decoded)
GitHub: WorksApplications / SudachiPy / tests / dictionarylib / test_dictionarybuilder.py (View on GitHub)
def test_parse_splitinfo(self):
    # Part 1: with a plain DictionaryBuilder, '*' parses to an empty list,
    # '1/2/3' to [1, 2, 3], and the 'U2' element of '1/U2/3' to 2.
    system_builder = DictionaryBuilder(logger=self.logger)
    system_builder.entries.extend([None, None, None, None])

    no_split = system_builder.parse_splitinfo('*')
    self.assertEqual([], no_split)
    plain_ids = system_builder.parse_splitinfo('1/2/3')
    self.assertEqual([1, 2, 3], plain_ids)
    with_user_ref = system_builder.parse_splitinfo('1/U2/3')
    self.assertEqual(2, with_user_ref[1])

    # Part 2: with a UserDictionaryBuilder backed by a lexicon mock whose
    # size() is 4, the 'U2' element is expected to come back with bit 28 set.
    mocked_lexicon = mock.Mock(spec=Lexicon)
    mocked_lexicon.size.return_value = 4
    user_builder = UserDictionaryBuilder(None, mocked_lexicon)
    user_builder.entries += [None] * 3

    flagged_id = 2 | (1 << 28)
    self.assertEqual([1, flagged_id, 3], user_builder.parse_splitinfo("1/U2/3"))