How to use the nltk.tokenize.sent_tokenize function in nltk

To help you get started, we’ve selected a few nltk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github NVIDIA / Megatron-LM / data_utils / View on Github external
def sentence_split(self, document):
    """Split ``document`` into a list of sentences.

    The document is first cut on newlines and empty lines are discarded.
    When ``self.presplit_sentences`` is true every remaining line is taken
    to be one sentence already; otherwise each line is run through NLTK's
    ``sent_tokenize`` and the results are concatenated in line order.

    Args:
        document: the text to split.

    Returns:
        A list of sentence strings (possibly empty).
    """
    lines = [line for line in document.split('\n') if line]
    if self.presplit_sentences:
        return lines

    # NOTE(review): the loop body was missing from the excerpt (the `if`
    # had nothing under it and `rtn` was never filled). Tokenising each
    # non-empty line is the reconstruction -- confirm against upstream.
    # Imported here so the pre-split path does not need NLTK at all.
    from nltk.tokenize import sent_tokenize

    rtn = []
    for line in lines:
        rtn.extend(sent_tokenize(line))
    return rtn
github Barqawiz / Shakkala / View on Github external
def get_sentences(data):
    """Return the non-empty sentences found in ``data``.

    ``data`` is cut on runs of newlines, ASCII commas and Arabic commas
    (U+060C). Each non-empty piece is stripped of surrounding whitespace
    and passed to ``sent_tokenize``; empty results are dropped.
    """
    sentences = []
    for chunk in re.split("[\n,،]+", data):
        if not chunk:
            continue
        for sentence in sent_tokenize(chunk.strip()):
            if sentence:
                sentences.append(sentence)
    return sentences
github aseempatni / author-profiling / src / PANdata / codes / View on Github external
authorFileNames = os.listdir(directory)

# Maps an author id (the file name up to its first '.') to
# [total word count, vocabulary size, sentence count] over that author's
# <document> elements.
author = {}
for file in authorFileNames:
    if file.endswith(".xml"):
        file_path = directory+"/"+file
        xmldoc = minidom.parse(file_path)
        rawdocuments = xmldoc.getElementsByTagName('document')
        length = 0
        # NOTE(review): no initialisation of this counter is visible in the
        # excerpt; it is reset per file here, like `length`, so that each
        # author's entry holds that author's own count -- confirm.
        no_of_sentences = 0
        vocabulary = set()
        for document in rawdocuments:
            text = removeTag_CDATA_section(document.firstChild.nodeValue.strip())
            sentences = nltk.tokenize.sent_tokenize(text)
            for sentence in sentences:
                length = length + len(sentence.split())
            # The excerpt only assigned `vocabulary` when it was already
            # non-empty, so it could never be filled. Accumulate the
            # whitespace-separated tokens of every document instead.
            # NOTE(review): union across documents is an assumption.
            vocabulary.update(text.split())
            no_of_sentences = no_of_sentences+len(sentences)
        # One assignment covers both cases: an empty vocabulary has size 0.
        # The excerpt wrote the real size and then overwrote it with 0.
        author[file.split('.')[0]] = [length, len(vocabulary), no_of_sentences]

# Close the file as soon as its lines are read; `ifile` stays bound
# (closed) for any later code that refers to the name.
with open('truth.txt') as ifile:
    truth_data = ifile.readlines()
github Rostlab / nalaf / source / preprocessing / View on Github external
def split(self, dataset):
        # NOTE(review): this excerpt is incomplete and does not parse as
        # shown. The next line looks like the remains of a docstring
        # (`:type dataset:` with its type and quotes lost) and the `for`
        # header below has lost its iterable. Restore both from the
        # upstream source before reusing this snippet.
        :type dataset:
        for part in
                # Store on the part the sentence list that sent_tokenize
                # produces from the part's text.
                part.sentences = sent_tokenize(part.text)
github koreyou / abae-chainer / abae / View on Github external
def read_dataset(paths):
    """Yield one filtered token list per sentence of every file in ``paths``.

    Each file is read in full, split into sentences with ``sent_tokenize``
    and each sentence into words with ``word_tokenize``. Words found in the
    module-level ``_stop_words`` collection are dropped.

    Args:
        paths: iterable of file paths to read, in order.

    Yields:
        A list of word strings for each sentence.
    """
    for p in paths:
        with open(p) as fin:
            # The right-hand side was missing in the excerpt (`text =`);
            # reading the whole file is what the loop below needs.
            text = fin.read()
        # The file is already closed here; only the text is kept.
        for s in sent_tokenize(text):
            words = word_tokenize(s)
            yield [w for w in words if w not in _stop_words]
github Hsankesara / DeepResearch / Hierarchical_Attention_Network / View on Github external
def preprocessing(self):
        """Preprocessing of the text to make it more resonant for training
        """
        # NOTE(review): this excerpt has lost lines. The docstring was left
        # unclosed, nothing below appends to `paras`, `labels` or `texts`,
        # and the final `if` has no body. Compare with the upstream file
        # before relying on it.
        paras = []
        labels = []
        texts = []
        # Clean each entry of self.text and split it into sentences.
        for idx in range(self.text.shape[0]):
            text = self.clean_string(self.text[idx])
            sentences = tokenize.sent_tokenize(text)
        # NOTE(review): no call that fits the tokenizer on any text is
        # visible before `word_index` is read below -- TODO confirm.
        tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
        # Zero-filled int32 array indexed [document, sentence, word].
        data = np.zeros((len(texts), self.max_senten_num,
                         self.max_senten_len), dtype='int32')
        for i, sentences in enumerate(paras):
            for j, sent in enumerate(sentences):
                # Sentences beyond max_senten_num are ignored.
                if j < self.max_senten_num:
                    wordTokens = text_to_word_sequence(sent)
                    # k is the next free word slot in this sentence's row.
                    k = 0
                    for _, word in enumerate(wordTokens):
                        # Skip words once the row is full, words missing
                        # from the tokenizer's index, and words whose index
                        # is not below max_features.
                        if k < self.max_senten_len and word in tokenizer.word_index and tokenizer.word_index[word] < self.max_features:
                            data[i, j, k] = tokenizer.word_index[word]
                            k = k+1
        self.word_index = tokenizer.word_index
        # NOTE(review): the body of this `if` is missing from the excerpt.
        if self.verbose == 1:
github aseempatni / author-profiling / src / feature / View on Github external
def count_sents_in_quotes(text):
    """Return the number of sentences in the quoted ``text``.

    When the tokenizer finds zero sentences, or more than one, that count
    is returned as is. A single detected sentence only counts when the
    text ends with terminal punctuation ('.', '!' or '?').

    Args:
        text: the quoted string to inspect.

    Returns:
        The sentence count as an int.
    """
    # NOTE(review): the assignment of `v` was missing from the excerpt.
    # It is reconstructed as the NLTK sentence count, which is what the
    # function name and the checks below imply -- confirm against upstream.
    v = len(nltk.tokenize.sent_tokenize(text))
    if v > 1 or v == 0:
        return v
    # Exactly one sentence was found, so `text` is non-empty here.
    # The excerpt tested '.' twice; the repeated test is taken to be a
    # typo for '?'.
    if text.endswith(('.', '!', '?')):
        return 1
    return 0
github DavidIsrawi / SummarizeMe / server / View on Github external
result = dict()
    result["text"] = ""
    result["stats"] = dict()
    result["stats"]["relevant_words"] = []
    result["stats"]["word_length"] = 0
    result["stats"]["avg_contrast"] = ""
    result["stats"]["avg_current"] = ""
    result["stats"]["totalSummaries"] = 0

    news = text
    summarySize = 0; # Store size of summary to retrieve stats

    # RegexpTokenizer used to avoid punctuation signs
    tokenizer = RegexpTokenizer(r"[a-zA-Z_']+")
    words = tokenizer.tokenize(news)
    sentences = sent_tokenize(news)

    # Retrieve set to remove stopwords from analysis
    stopWords = set(stopwords.words("english"))

    # Use stemmers in the future, maybe run the code with both and retrieve most efficient
    ps = PorterStemmer()
    pss = SnowballStemmer("english")
    freq = dict()            # Frequency array for words
    sentenceVal = dict()     # Number of instances a word from freq is contained in a sentence

    for w in words:
        w = w.lower()
        if w in stopWords:
        if w in freq:
            freq[w] += 1
github dmmiller612 / bert-extractive-summarizer / View on Github external
def __process_sentences(self, v) -> List[str]:
    """Split the text ``v`` into sentences using ``tokenize.sent_tokenize``."""
    return tokenize.sent_tokenize(v)
github sloria / TextBlob / View on Github external
def extract(self, text):
    """Return a list of noun phrases (strings) for body of text.

    The text is split into sentences, each sentence is parsed with
    ``self._parse_sentence``, and the string form of every matching
    noun-phrase subtree is collected in sentence order.

    Args:
        text: the text to extract noun phrases from.

    Returns:
        A list of noun-phrase strings (possibly empty).
    """
    noun_phrases = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        parsed = self._parse_sentence(sentence)
        # Keep the subtrees labelled 'NP' that still have at least one
        # element after filtering and that satisfy _is_match for
        # self.CFG, with tags normalised.
        phrases = [
            _normalize_tags(
                filter_insignificant(each, self.INSIGNIFICANT_SUFFIXES))
            for each in parsed
            if isinstance(each, nltk.tree.Tree)
            and each.label() == 'NP'
            and len(filter_insignificant(each)) >= 1
            and _is_match(each, cfg=self.CFG)
        ]
        # The excerpt built this sentence's phrase strings but never added
        # them to the result, so the function always returned [].
        noun_phrases.extend(tree2str(phrase) for phrase in phrases)
    return noun_phrases