How to use the konoha.sentence_tokenizer.SentenceTokenizer function in konoha

To help you get started, we’ve selected a few konoha examples based on popular ways the library is used in public projects.
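Before the examples, here is a minimal usage sketch; the input string and the printed result are illustrative, put together to match the test expectations shown below rather than copied from the konoha documentation:

from konoha.sentence_tokenizer import SentenceTokenizer

# Create a sentence tokenizer and split a raw Japanese string into sentences.
tokenizer = SentenceTokenizer()
sentences = tokenizer.tokenize("私は猫である。にゃお。にゃにゃ")
print(sentences)  # expected: ['私は猫である。', 'にゃお。', 'にゃにゃ']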


himkt / tiny_tokenizer / tests / sentence_tokenizer / test_sentence_tokenize.py (View on GitHub)
def test_sentence_tokenize_with_combined():
    corpus = SentenceTokenizer()
    expect = ["わんわん。", "「にゃ?」(にゃー)わんわん。", "「わおーん。」(犬より。)"]
    result = corpus.tokenize(DOCUMENT4)
    assert expect == result

himkt / tiny_tokenizer / tests / sentence_tokenizer / test_sentence_tokenize.py (View on GitHub)
def test_sentence_tokenize():
    corpus = SentenceTokenizer()
    expect = ["私は猫である。", "にゃお。", "にゃにゃ", "わんわん。", "にゃーにゃー。"]
    result = corpus.tokenize(DOCUMENT1)
    assert expect == result

himkt / tiny_tokenizer / tests / sentence_tokenizer / test_sentence_tokenize.py (View on GitHub)
def test_sentence_tokenize_with_bracket():
    corpus = SentenceTokenizer()
    expect = ["私は猫である(ただしかわいいものとする。異議は認める)。", "にゃお。", "にゃにゃ"]
    result = corpus.tokenize(DOCUMENT2)
    assert expect == result

himkt / tiny_tokenizer / tests / sentence_tokenizer / test_sentence_tokenize.py (View on GitHub)
def test_sentence_tokenize_with_quotation():
    corpus = SentenceTokenizer()
    expect = ["猫「にゃおにゃ。ただしかわいいものとする。異議は認める」。", "にゃお。", "にゃにゃ"]
    result = corpus.tokenize(DOCUMENT3)
    assert expect == result

himkt / tiny_tokenizer / konoha / sentence_tokenizer.py (View on GitHub)
def tokenize(self, document) -> List[str]:
        """
        Divide a raw document into sentences.
        :param document: a raw document
        :type document: str
        :return: list of sentences
        :rtype: list[str]
        """

        for pattern in SentenceTokenizer.PATTERNS:
            pattern = re.compile(pattern)  # type: ignore
            document = re.sub(pattern, self.conv_period, document)

        result = []
        for line in document.split("\n"):
            line = line.rstrip()
            line = line.replace("\n", "")
            line = line.replace("\r", "")
            line = line.replace("。", "。\n")
            sentences = line.split("\n")

            for sentence in sentences:
                if not sentence:
                    continue

                period_special = SentenceTokenizer.PERIOD_SPECIAL
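Based on the tokenize logic shown above, here is a short hedged sketch of the bracket handling; the document string is a hypothetical stand-in for the DOCUMENT2 fixture (not shown on this page), and the expected split mirrors test_sentence_tokenize_with_bracket:

from konoha.sentence_tokenizer import SentenceTokenizer

tokenizer = SentenceTokenizer()

# The period inside the parentheses is shielded by the bracket pattern, so it does
# not end a sentence; only the "。" outside the parentheses triggers a split.
document = "私は猫である(ただしかわいいものとする。異議は認める)。にゃお。にゃにゃ"
print(tokenizer.tokenize(document))
# expected: ['私は猫である(ただしかわいいものとする。異議は認める)。', 'にゃお。', 'にゃにゃ']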