How to use the sumy.utils.get_stop_words function in sumy

To help you get started, we’ve selected a few sumy.utils.get_stop_words examples based on popular ways the function is used in public projects.

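Before the project examples, here is a minimal, self-contained sketch of how get_stop_words typically fits into a sumy pipeline. The sample text, the "english" language choice, and the LSA summarizer are illustrative choices, not requirements of the function.

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"

TEXT = (
    "Sumy is a library for automatic summarization of text documents. "
    "It implements several summarization algorithms and ships stop-word "
    "lists for a number of languages. Stop words are filtered out before "
    "sentences are scored, which keeps common words from dominating."
)

parser = PlaintextParser.from_string(TEXT, Tokenizer(LANGUAGE))
summarizer = LsaSummarizer(Stemmer(LANGUAGE))

# get_stop_words returns the stop-word set bundled with sumy for the given
# language; it raises LookupError if no list is bundled for that language
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, 2):
    print(sentence)

Any of sumy's summarizers can consume the same stop-word set; assigning it to the summarizer's stop_words attribute is the common pattern across the examples below.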

github miso-belica / sumy / tests / test_summarizers / test_luhn.py View on Github
# imports as used in sumy's own test suite; sumy.nlp.stemmers.czech provides stem_word
from sumy._compat import to_unicode
from sumy.nlp.stemmers.czech import stem_word
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words


def test_real_example():
    # Czech sample text: a story about a sixth-grader who failed a language
    # exam, had to repeat the grade, and asserted himself among younger pupils
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech")
    )
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]

github miso-belica / sumy / tests / test_utils / test_utils.py View on Github
from sumy.utils import get_stop_words


def test_ok_stop_words_language():
    stop_words = get_stop_words("french")
    assert len(stop_words) > 1

github miso-belica / sumy / sumy / evaluation / __main__.py View on Github
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.kl import KLSummarizer
from sumy.utils import get_stop_words


def build_kl(parser, language):
    summarizer = KLSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    return summarizer

github sidhusmart / WACAO / webwhatsapi / __init__.py View on Github
        # inputLine accumulates the chat messages to summarize (the loop
        # context is elided from this excerpt)
        inputLine = inputLine + message['message'] + '. '
        # blob = TextBlob(inputLine)
        # wordCounts = blob.word_counts
        # sortedWordCounts = sorted(wordCounts, key=wordCounts.get, reverse=True)
        # outputLine = " ".join(sortedWordCounts[:5])
        # outputLine = groupName.capitalize() + " summarized as " + outputLine
        # self.send_to_whatsapp_id("WACAO!",outputLine)

        LANGUAGE = "english"
        SENTENCES_COUNT = '20%'

        outputLine = groupName.capitalize() + " summarized as: \n"
        parser = PlaintextParser.from_string(inputLine, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = LsaSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        # sumy accepts a percentage string such as "20%" as the sentence count
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            # Python 2 idiom; under Python 3 this would simply be str(sentence)
            outputLine = outputLine + unicode(str(sentence), "utf-8") + "\n"
        self.send_to_whatsapp_id("WACAO!", outputLine)
        # print "sum_basic:"

github miso-belica / sumy / sumy / evaluation / __main__.py View on Github
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.utils import get_stop_words


def build_edmundson(parser, language):
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.null_words = get_stop_words(language)
    summarizer.bonus_words = parser.significant_words
    summarizer.stigma_words = parser.stigma_words

    return summarizer
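Note the difference from the other builders: EdmundsonSummarizer does not take the stop words through a stop_words attribute. In Edmundson's method the list is assigned to null_words, the words treated as insignificant, while bonus_words and stigma_words respectively raise and lower the rating of sentences that contain them.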

github mikelkl / news_summarization / newssum / feature_extraction / sentence_feature.py View on Github
from sklearn.feature_extraction.text import CountVectorizer
from sumy.utils import get_stop_words


# in the source this is a method inside newssum's sentence_feature.py (it
# takes no self argument); shown here as a standalone function with its imports
def get_global_term_freq(parsers):
    """Compute a global document-term matrix over one or more parsed stories.

    :param parsers: newssum.parsers.StoryParser or list of them
    :return: tuple, (vectorizer, X)
        vectorizer, sklearn.feature_extraction.text.CountVectorizer.
        X, Document-term matrix.
    """
    vectorizer = CountVectorizer(stop_words=get_stop_words("english"))
    if isinstance(parsers, list):
        corpus = [parser.body for parser in parsers]
    else:
        corpus = [parsers.body]
    X = vectorizer.fit_transform(corpus)
    return (vectorizer, X)
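This example is a reminder that get_stop_words is useful outside sumy's own summarizers as well: it returns a plain frozenset of words, which scikit-learn's CountVectorizer accepts directly as a custom stop-word collection.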

github mikelkl / news_summarization / newssum / parsers / plaintext.py View on Github
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from sumy.utils import get_stop_words


# __init__ of newssum's plaintext parser, which extends a common parser base
# class (the super().__init__ call below refers to that base)
def __init__(self, text, pos_tagger=pos_tag, keep_only_n_and_adj=True, remove_stopwords=True,
                 stemming_mode="stemming"):
        self.body = text.strip()
        self.sents = sent_tokenize(self.body)
        super().__init__(word_tokenize, get_stop_words("english"), pos_tagger, keep_only_n_and_adj, remove_stopwords,
                         stemming_mode)

github miso-belica / sumy / sumy / evaluation / __main__.py View on Github
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words


def build_lsa(parser, language):
    summarizer = LsaSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    return summarizer

github OpenGenus / vidsum / code / sum.py View on Github
""" Generate segmented summary

    Args:
        srt_file(str) : The name of the SRT FILE
        n_sentences(int): No of sentences
        language(str) : Language of subtitles (default to English)

    Returns:
        list: segment of subtitles

    """
    parser = PlaintextParser.from_string(
        srt_to_txt(srt_file), Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    segment = []
    for sentence in summarizer(parser.document, n_sentences):
        index = int(re.findall(r"\(([0-9]+)\)", str(sentence))[0])
        item = srt_file[index]
        segment.append(srt_segment_to_range(item))
    return segment

github miso-belica / sumy / sumy / evaluation / __main__.py View on Github
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.utils import get_stop_words


def build_text_rank(parser, language):
    summarizer = TextRankSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    return summarizer