            for j, char in enumerate(line):
                if j > i and char not in ['。', '！', '？', '!', '?', '’', '”', '）', ')']:
                    sentences.append(line[sentence_start : j])
                    sentence_start = j
                    break
        if sentence_start <= len(line):
            sentences.append(line[sentence_start:])
    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [sentence.text for sentence in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - Thai Sentence Tokenizer'):
        sentences = pythainlp.tokenize.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('Wordless - Tibetan Sentence Tokenizer'):
        sentences = text.split()
    # Vietnamese
    elif sentence_tokenizer == main.tr('Underthesea - Vietnamese Sentence Tokenizer'):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    sentences = [sentence.strip() for sentence in sentences]
    sentences = wordless_text_utils.record_boundary_sentences(sentences, text)

    return sentences
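For reference, the third-party tokenizer calls dispatched above can also be used directly. A minimal sketch, assuming razdel, pythainlp, and underthesea are installed (the sample strings are purely illustrative):

import razdel
import pythainlp.tokenize
import underthesea

# razdel yields Substring objects; take their .text
russian_sents = [s.text for s in razdel.sentenize("Мама мыла раму. Папа читал газету.")]
thai_sents = pythainlp.tokenize.sent_tokenize("ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย")
vietnamese_sents = underthesea.sent_tokenize("Tôi là sinh viên. Tôi học tiếng Việt.")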
def summarize(self, text: str, n: int, tokenizer: str = "newmm") -> List[str]:
    sents = sent_tokenize(text, engine="whitespace+newline")
    word_tokenized_sents = [word_tokenize(sent, engine=tokenizer) for sent in sents]
    self.__freq = self.__compute_frequencies(word_tokenized_sents)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_tokenized_sents):
        for w in sent:
            if w in self.__freq:
                ranking[i] += self.__freq[w]
    summaries_idx = self.__rank(ranking, n)
    return [sents[j] for j in summaries_idx]
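A short usage sketch of this frequency-based summarizer. The import path is an assumption; in practice the public summarize() wrapper shown next is the usual entry point:

from pythainlp.summarize.freq import FrequencySummarizer  # import path assumed

text = "อาหารไทยมีรสจัดจ้าน อาหารไทยมีหลายภาค อาหารไทยเป็นที่นิยมทั่วโลก"
# Rank sentences by summed word frequencies and return the top-scoring one
print(FrequencySummarizer().summarize(text, n=1))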
    """
        summarize(text, n=1)
        # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์']

        summarize(text, n=3)
        # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์',
        #          'เดิมเป็นบ้านของเจ้าพระยามหาโยธา',
        #          'เจ้าพระยามหาโยธา']
    """
    sents = []
    if engine == "frequency":
        sents = FrequencySummarizer().summarize(text, n, tokenizer)
    else:  # if engine not found, return first n sentences
        sents = sent_tokenize(text, engine="whitespace+newline")[:n]
    return sents
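This wrapper is exposed as pythainlp.summarize.summarize; a runnable example in the spirit of the docstring above (the input text is illustrative, so the exact output will differ):

from pythainlp.summarize import summarize

text = "อาหารไทยเป็นที่นิยมไปทั่วโลก อาหารไทยมีรสชาติจัดจ้าน อาหารไทยใช้สมุนไพรหลายชนิด"
print(summarize(text, n=1, engine="frequency"))
print(summarize(text, n=2, engine="does-not-exist"))  # unknown engine: falls back to the first n sentences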
# -*- coding: utf-8 -*-
from pythainlp.tokenize import sent_tokenize, word_tokenize
text = "ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย "
print(text)
print(sent_tokenize(text))
# ['ฉันรักภาษาไทย', 'เพราะฉันใช้ภาษาไทย', '']
print(word_tokenize(text))
# ['ฉัน', 'รัก', 'ภาษาไทย', ' ', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย', ' ']
print(word_tokenize(text, whitespaces=False))  # the parameter is named keep_whitespace in PyThaiNLP 2.x and later
# ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย']
text2 = "กฎหมายแรงงาน"
print(text2)
print(word_tokenize(text2))
# ['กฎหมายแรงงาน']
print(word_tokenize(text2, engine="longest"))
# ['กฎหมาย', 'แรงงาน']
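sent_tokenize likewise accepts an explicit engine; a small sketch using the whitespace-based engine that the summarizer above relies on:

from pythainlp.tokenize import sent_tokenize

text3 = "ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย"
print(sent_tokenize(text3, engine="whitespace+newline"))
# e.g. ['ฉันรักภาษาไทย', 'เพราะฉันใช้ภาษาไทย']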
def __init__(self, *args, **kwargs):
    self.keep_whitespace = True
    self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
    self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
    self.run = sent_tokenize
    super().__init__(*args, **kwargs)