How to use the konoha.konoha_token.Token class in konoha

To help you get started, we’ve selected a few konoha examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py View on Github external
def test_word_tokenize_with_sentencepiece(self):
        """Check Sentencepiece segmentation of SENTENCE1.

        Two tokenizers are built with the same arguments; the test asserts
        that the first one yields the expected tokens and that both yield
        the same result. The test is skipped when building a tokenizer
        raises ImportError.
        """
        spm_options = {
            "tokenizer": "Sentencepiece",
            "model_path": "data/model.spm",
        }
        try:
            first = WordTokenizer(**spm_options)
            second = WordTokenizer(**spm_options)
        except ImportError:
            pytest.skip("skip sentencepiece")

        surfaces = "▁ 吾 輩 は 猫 である".split(" ")
        expect = [Token(surface=surface) for surface in surfaces]  # NOQA
        tokens_first = first.tokenize(SENTENCE1)
        tokens_second = second.tokenize(SENTENCE1)

        # Expected output first, then agreement between the two instances.
        assert expect == tokens_first  # NOQA
        assert tokens_first == tokens_second
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py View on Github external
def test_word_tokenize_with_sudachi_mode_b(self):
        """Check Sudachi segmentation of SENTENCE2 with mode "B".

        The test is skipped when building the tokenizer raises ImportError.
        """
        try:
            sudachi = WordTokenizer(tokenizer="Sudachi", mode="B")
        except ImportError:
            pytest.skip("skip sudachi")

        surfaces = "医薬品 安全 管理 責任者".split(" ")
        expected_tokens = [Token(surface=surface) for surface in surfaces]
        actual_tokens = sudachi.tokenize(SENTENCE2)
        self.assertEqual(expected_tokens, actual_tokens)
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py View on Github external
def test_word_tokenize_with_sudachi_mode_a(self):
        """Check Sudachi segmentation of SENTENCE2 with mode "A".

        The test is skipped when building the tokenizer raises ImportError.
        """
        try:
            sudachi = WordTokenizer(tokenizer="Sudachi", mode="A")
        except ImportError:
            pytest.skip("skip sudachi")

        surfaces = "医薬 品 安全 管理 責任 者".split(" ")
        expected_tokens = [Token(surface=surface) for surface in surfaces]
        actual_tokens = sudachi.tokenize(SENTENCE2)
        self.assertEqual(expected_tokens, actual_tokens)
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py View on Github external
def test_word_tokenize_with_kytea_using_custom_model(self):
        """Check KyTea segmentation of SENTENCE1 with the model at data/model.knm.

        The test is skipped when building the tokenizer raises ImportError.
        """
        try:
            kytea = WordTokenizer(tokenizer="KyTea", model_path="data/model.knm")
        except ImportError:
            pytest.skip("skip kytea")

        surfaces = "吾輩は 猫である".split(" ")
        expected_tokens = [Token(surface=surface) for surface in surfaces]  # NOQA
        actual_tokens = kytea.tokenize(SENTENCE1)
        assert expected_tokens == actual_tokens  # NOQA
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py View on Github external
def test_word_tokenize_with_whitespace(self):
        """Check Whitespace segmentation of SENTENCE3.

        The tokenizer name is given once capitalised and once in lower case;
        both instances must produce the same tokens.
        """
        capitalised = WordTokenizer(tokenizer="Whitespace")
        lowercased = WordTokenizer(tokenizer="whitespace")

        # Only the outputs are compared, not the tokenizer objects themselves.
        surfaces = "吾輩 は 猫 で ある".split(" ")
        expect = [Token(surface=surface) for surface in surfaces]  # NOQA
        tokens_capitalised = capitalised.tokenize(SENTENCE3)
        tokens_lowercased = lowercased.tokenize(SENTENCE3)

        assert expect == tokens_capitalised  # NOQA
        assert tokens_capitalised == tokens_lowercased
github himkt / tiny_tokenizer / tests / word_tokenizer / test_segmentation.py View on Github external
def test_word_tokenize_with_character(self):
        """Check Character segmentation of SENTENCE1.

        The tokenizer name is given once capitalised and once in lower case;
        both instances must produce the same tokens.
        """
        capitalised = WordTokenizer(tokenizer="Character")
        lowercased = WordTokenizer(tokenizer="character")

        # Only the outputs are compared, not the tokenizer objects themselves.
        surfaces = "吾 輩 は 猫 で あ る".split(" ")
        expect = [Token(surface=surface) for surface in surfaces]  # NOQA
        tokens_capitalised = capitalised.tokenize(SENTENCE1)
        tokens_lowercased = lowercased.tokenize(SENTENCE1)

        assert expect == tokens_capitalised  # NOQA
        assert tokens_capitalised == tokens_lowercased
github himkt / tiny_tokenizer / tests / word_tokenizer / test_mecab_tokenizer.py View on Github external
def test_word_tokenize_with_mecab():
    """Check MeCab segmentation of a fixed sentence.

    The test is skipped when the ``natto`` module cannot be imported.
    """
    try:
        import natto
    except ImportError:
        pytest.skip("natto-py is not installed.")
    else:
        # The import is only an availability probe; drop the name again.
        del natto

    mecab = WordTokenizer(tokenizer="MeCab")
    surfaces = "吾輩 は 猫 で ある".split(" ")
    expected_tokens = [Token(surface=surface) for surface in surfaces]
    actual_tokens = mecab.tokenize("吾輩は猫である")
    assert expected_tokens == actual_tokens
github himkt / tiny_tokenizer / tests / word_tokenizer / test_kytea_tokenizer.py View on Github external
def test_word_tokenize_with_kytea():
    """Check KyTea segmentation of a fixed sentence.

    The test is skipped when the ``Mykytea`` module cannot be imported.
    """
    try:
        import Mykytea
    except ImportError:
        pytest.skip("Mykytea is not installed.")
    else:
        # The import is only an availability probe; drop the name again.
        del Mykytea

    kytea = WordTokenizer(tokenizer="KyTea")
    surfaces = "吾輩 は 猫 で あ る".split(" ")
    expected_tokens = [Token(surface=surface) for surface in surfaces]
    actual_tokens = kytea.tokenize("吾輩は猫である")
    assert expected_tokens == actual_tokens
github himkt / tiny_tokenizer / tests / word_tokenizer / test_sudachi_tokenizer.py View on Github external
def test_postagging_with_sudachi_mode_a():
    """Check Sudachi mode "A" output when ``with_postag=True``.

    Expected tokens are built from the keyword dicts in
    ``sudachi_tokens_list``. The test is skipped when building the
    tokenizer raises ImportError.
    """
    try:
        sudachi = WordTokenizer(tokenizer="sudachi", mode="A", with_postag=True)
    except ImportError:
        pytest.skip("SudachiPy is not installed.")

    expected_tokens = [Token(**fields) for fields in sudachi_tokens_list]
    actual_tokens = sudachi.tokenize("医薬品安全管理責任者")
    assert expected_tokens == actual_tokens
github himkt / tiny_tokenizer / tests / word_tokenizer / test_mecab_tokenizer.py View on Github external
def test_postagging_with_mecab():
    """Check MeCab output when ``with_postag=True``.

    Expected tokens are built from the keyword dicts in
    ``mecab_tokens_list``. The test is skipped when building the
    tokenizer raises ImportError.
    """
    try:
        mecab = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("natto-py is not installed.")

    expected_tokens = [Token(**fields) for fields in mecab_tokens_list]
    actual_tokens = mecab.tokenize("吾輩は猫である")
    assert expected_tokens == actual_tokens