How to use the pythainlp.tokenize.sent_tokenize function in pythainlp

To help you get started, we’ve selected a few pythainlp examples based on popular ways the library is used in public projects.
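
Before the project snippets, here is a minimal sketch of the function on its own (the sample sentence is ours, and the exact split depends on the installed pythainlp version and its default engine):

from pythainlp.tokenize import sent_tokenize

# Split Thai text into sentences; output varies with the default engine.
text = "ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย"
print(sent_tokenize(text))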


github BLKSerene / Wordless / src / wordless_text / wordless_text_processing.py
                    for j, char in enumerate(line):
                        if j > i and char not in ['。', '!', '?', '!', '?', '’', '”', ')', ')']:
                            sentences.append(line[sentence_start : j])

                            sentence_start = j

                            break

            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [sentence.text for sentence in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - Thai Sentence Tokenizer'):
        sentences = pythainlp.tokenize.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('Wordless - Tibetan Sentence Tokenizer'):
        sentences = text.split()
    # Vietnamese
    elif sentence_tokenizer == main.tr('Underthesea - Vietnamese Sentence Tokenizer'):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    sentences = [sentence.strip() for sentence in sentences]

    sentences = wordless_text_utils.record_boundary_sentences(sentences, text)

    return sentences
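
Isolated from the dispatch above, the Thai branch amounts to the following minimal sketch (the helper name is hypothetical; only the pythainlp call and the strip step come from the snippet):

import pythainlp.tokenize

def split_thai_sentences(text):  # hypothetical helper, for illustration
    # sent_tokenize returns a list of sentence strings; strip() mirrors
    # the snippet's later whitespace cleanup.
    return [sentence.strip() for sentence in pythainlp.tokenize.sent_tokenize(text)]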
github PyThaiNLP / pythainlp / pythainlp / summarize / freq.py
def summarize(self, text: str, n: int, tokenizer: str = "newmm") -> List[str]:
        sents = sent_tokenize(text, engine="whitespace+newline")
        word_tokenized_sents = [word_tokenize(sent, engine=tokenizer) for sent in sents]
        self.__freq = self.__compute_frequencies(word_tokenized_sents)
        ranking = defaultdict(int)

        for i, sent in enumerate(word_tokenized_sents):
            for w in sent:
                if w in self.__freq:
                    ranking[i] += self.__freq[w]
        summaries_idx = self.__rank(ranking, n)

        return [sents[j] for j in summaries_idx]
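
A hedged usage sketch for the class above; the import path is inferred from the snippet's file location, and the sample text is our own:

from pythainlp.summarize.freq import FrequencySummarizer

# Rank sentences by summed word frequency and keep the top one.
text = "ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย"
print(FrequencySummarizer().summarize(text, n=1))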
github PyThaiNLP / pythainlp / pythainlp / summarize / __init__.py
'''

            summarize(text, n=1)
            # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์']

            summarize(text, n=3)
            # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์',
            # 'เดิมเป็นบ้านของเจ้าพระยามหาโยธา',
            # 'เจ้าพระยามหาโยธา']
    """
    sents = []

    if engine == "frequency":
        sents = FrequencySummarizer().summarize(text, n, tokenizer)
    else:  # if engine not found, return first n sentences
        sents = sent_tokenize(text, engine="whitespace+newline")[:n]

    return sents
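
Note the fallback in the else branch: an unrecognized engine name does not raise an error. A typo in the engine argument therefore degrades silently to a naive head-of-document summary (the first n whitespace-and-newline-split sentences) rather than failing loudly, which is worth knowing when debugging unexpected summaries.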
github PyThaiNLP / pythainlp / examples / tokenize.py
# -*- coding: utf-8 -*-

from pythainlp.tokenize import sent_tokenize, word_tokenize

text = "ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย "
print(text)

print(sent_tokenize(text))
# ['ฉันรักภาษาไทย', 'เพราะฉันใช้ภาษาไทย', '']

print(word_tokenize(text))
# ['ฉัน', 'รัก', 'ภาษาไทย', ' ', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย', ' ']

print(word_tokenize(text, whitespaces=False))
# ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย']

text2 = "กฎหมายแรงงาน"
print(text2)

print(word_tokenize(text2))
# ['กฎหมายแรงงาน']

print(word_tokenize(text2, engine="longest"))
# ['กฎหมาย', 'แรงงาน']
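
The two results illustrate why the engine choice matters: the default "newmm" engine (dictionary-based maximal matching) keeps the compound กฎหมายแรงงาน ("labor law") as a single token, while the "longest" engine (longest matching) splits it into กฎหมาย ("law") and แรงงาน ("labor").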
github PyThaiNLP / pythainlp / pythainlp / cli / tokenize.py
def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
        self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
        self.run = sent_tokenize
        super().__init__(*args, **kwargs)
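
This subcommand wrapper does little more than bind sent_tokenize as its run callable, together with the package's default sentence-tokenization engine and token separator (the DEFAULT_SENT_TOKENIZE_ENGINE and DEFAULT_SENT_TOKEN_SEPARATOR constants); presumably the shared base class then applies self.run to the input text and joins the resulting sentences with self.separator when the thainlp tokenize command is invoked.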