How to use the sumy.parsers.html.HtmlParser function in sumy

To help you get started, we’ve selected a few HtmlParser examples based on popular ways sumy is used in public projects.

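All of the snippets below follow the same pattern: build a parser with HtmlParser.from_url, HtmlParser.from_file, or HtmlParser.from_string, passing a Tokenizer for the target language, then hand parser.document to one of sumy's summarizers. Here is a minimal sketch of that pattern; the URL, language, and sentence count are placeholders, and LexRank stands in for whichever summarizer you prefer.

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 5  # placeholder: how many sentences to keep

# fetch and parse an HTML page directly from a URL (placeholder URL)
url = "https://en.wikipedia.org/wiki/Automatic_summarization"
parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)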

github miso-belica / sumy / tests / test_parsers.py
def test_annotated_text(self):
        path = expand_resource_path("snippets/paragraphs.html")
        url = "http://www.snippet.org/paragraphs.html"
        parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

        document = parser.document

        self.assertEqual(len(document.paragraphs), 2)

        self.assertEqual(len(document.paragraphs[0].headings), 1)
        self.assertEqual(len(document.paragraphs[0].sentences), 1)

        self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
            "Toto je nadpis prvej úrovne")
        self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
            "Toto je prvý odstavec a to je fajn.")

        self.assertEqual(len(document.paragraphs[1].headings), 0)
        self.assertEqual(len(document.paragraphs[1].sentences), 2)
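
As the test above shows, HtmlParser.from_file takes a path to a local HTML file, a base URL for the document, and a Tokenizer; the parsed document then exposes paragraphs, each with headings and sentences. A brief sketch along the same lines, where the file name and base URL are placeholders:

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer

# parse a locally saved HTML page; the URL argument only serves as the document's base address
parser = HtmlParser.from_file("saved_page.html", "http://example.com/saved_page.html", Tokenizer("english"))

for paragraph in parser.document.paragraphs:
    for heading in paragraph.headings:
        print("heading:", heading)
    for sentence in paragraph.sentences:
        print("sentence:", sentence)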
github ucfnlp / multidoc_summarization / src / sumy_summarize.py
    elif summary_method == 'kl':
        summary_fn = KLSummarizer
    elif summary_method == 'sumbasic':
        summary_fn = SumBasicSummarizer
    else:
        raise Exception('Could not find summary method ' + summary_method)

    if not os.path.exists(os.path.join(out_dir, summary_method, reference_folder)):
        os.makedirs(os.path.join(out_dir, summary_method, reference_folder))
    if not os.path.exists(os.path.join(out_dir, summary_method, decoded_folder)):
        os.makedirs(os.path.join(out_dir, summary_method, decoded_folder))
    print(os.path.join(out_dir, summary_method))
    article_names = sorted(os.listdir(articles_dir))
    for art_idx, article_name in enumerate(tqdm(article_names)):
        file = os.path.join(articles_dir, article_name)
        parser = HtmlParser.from_file(file, "", Tokenizer("english"))
        summarizer = summary_fn()

        summary = summarizer(parser.document, 5) #Summarize the document with 5 sentences
        summary = [str(sentence) for sentence in summary]
        with open(os.path.join(out_dir, summary_method, decoded_folder, article_name), 'w') as f:
            f.write('\n'.join(summary))

        summary_tokenized = []
        for sent in summary:
            summary_tokenized.append(' '.join(nltk.tokenize.word_tokenize(sent.lower())))
        with open(os.path.join(abstract_dir, article_name)) as f:
            abstracts_text = f.read()
        abstracts = abstracts_text.split('\n\n')
        abstracts_sentences = []
        for abs_idx, abstract in enumerate(abstracts):
            abstract_sents = abstract.split('\n')
github genekogan / text-learning / summarize / summarize.py
def main(url, num_sentences=10, language='english'):
	parser = HtmlParser.from_url(url, Tokenizer(language))
	stemmer = Stemmer(language)
	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(language)
	for sentence in summarizer(parser.document, num_sentences):
		print(sentence)
github SHARVAI101 / KJSCE-Writeup-Creator / writeup.py
			searchlink=clean_links[i]
			print("Search Link --> "+str(searchlink))

			if searchlink[-4:]=='.pdf' or searchlink[-4:]=='.ppt':
				# go to the next link if the current link is a ppt or pdf
				print("Can't include ppts or pdfs, trying next link on Google")
				linkno+=1
				if linkno>9:
					# if the number of links on one page has been exceeded, go to the next Google results page
					num_page+=1
					linkno=0
			else:
				LANGUAGE = "english"
				SENTENCES_COUNT = 10

				parser = HtmlParser.from_url(searchlink, Tokenizer(LANGUAGE))

				# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				# Summarisation using Luhn Summarizer
				stopwords1 = set(stopwords.words('english'))

				datastring=''

				# using the LuhnSummarizer
				summarizer = LuhnSummarizer() 
				summarizer.stop_words = stopwords1
				for sentence in summarizer(parser.document, SENTENCES_COUNT):
					# print(sentence)
					datastring+=str(sentence)

				return datastring
		except:
github jim-schwoebel / voicebook / chapter_5_generation / generate_summary.py
    parser = PlaintextParser.from_file(textfile, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # now summarize: output as [txtfile]_summary.txt
    g=open(textfile[0:-4]+'_summary.txt','w')
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        g.write(str(sentence))
    g.close()
    os.system('open %s'%(textfile[0:-4]+'_summary.txt'))
elif ftype in ['w']:
    # for URLS
    url=input('what link would you like to summarize on Wikipedia? \n')
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # for plaintext
    #parser = PlaintextParser.from_file("poetry.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # now summarize: output as [txtfile]_summary.txt
    g=open('web_summary.txt','w')
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        g.write(str(sentence))
    g.close()
    os.system('open web_summary.txt')
github BernhardWenzel / scraping-microservice-java-python-rabbitmq / python-scraping-service / scraper.py
def scrape(self, url):
        complete_url = url
        try:
            # get summary
            print "Retrieving page summary of %s... " % url

            parser = HtmlParser.from_url(complete_url, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)

            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)

            url_summary = ''.join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))

        except Exception as e:
            url_summary = "Could not scrape summary. Reason: %s" % e

        print("Done: %s = %s" % (url, url_summary))

        # create scraping result
        scraping_result = ScrapingResult()

        scraping_result.summary = url_summary
github amyxzhang / wikum / wikum / website / views.py
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    
    comment_ids = request.POST.getlist('d_ids[]')
    
    sent_list = []
    
    for comment_id in comment_ids:
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        
        text = re.sub('<br>', ' ', text)
        text = re.sub('<br>', ' ', text)
        
        parser = HtmlParser.from_string(text, '', Tokenizer("english"))
        
        num_sents = request.GET.get('num_sents', None)
        if not num_sents:
            all_sents = parser.tokenize_sentences(text)
            num_sents = floor(float(len(all_sents))/3.0)
        
        sents = summarizer(parser.document, num_sents)
         
        
        for sent in sents:
            if 'https://en.wikipedia.org/wiki/' in comment.article.url:
                text = parser.parse(sent._text)
                sent = ''
                in_tag = False
                for c in text:
                    if c == '<':
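
The wikum snippet above feeds comment HTML that is already held in memory to HtmlParser.from_string rather than reading a file or URL. A minimal sketch of that pattern; the HTML fragment and sentence count are placeholders, and Luhn stands in for whichever summarizer you prefer:

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer

# an in-memory HTML fragment (placeholder content)
html = "<p>Sumy can parse raw HTML strings. It extracts sentences from the markup. Those sentences can then be summarized.</p>"

# from_string takes the HTML text, a base URL (may be empty), and a tokenizer
parser = HtmlParser.from_string(html, "", Tokenizer("english"))

summarizer = LuhnSummarizer()
for sentence in summarizer(parser.document, 1):
    print(sentence)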
github adityasarvaiya / Automatic_Question_Generation / aqg / utils / summarizer.py
def summarize_from_url(self,url):

        parser = HtmlParser.from_url(url, Tokenizer(self.LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        file_1 = open("summarizer_output.txt","w+")
        file_2 = open("summarizer_output2.txt","w+")
        for sentence in summarizer(parser.document, self.SENTENCES_COUNT):
            file_2.write(str(sentence))
            file_1.write(str(sentence))
            file_1.write("\n")
        file_1.close()
        file_2.close()
github miso-belica / sumy / sumy / evaluation / __main__.py
# parsers and stop-word helper used below
from ..utils import get_stop_words
from ..parsers.html import HtmlParser
from ..parsers.plaintext import PlaintextParser
from ..summarizers.random import RandomSummarizer
from ..summarizers.luhn import LuhnSummarizer
from ..summarizers.edmundson import EdmundsonSummarizer
from ..summarizers.lsa import LsaSummarizer
from ..summarizers.text_rank import TextRankSummarizer
from ..summarizers.lex_rank import LexRankSummarizer
from ..summarizers.sum_basic import SumBasicSummarizer
from ..summarizers.kl import KLSummarizer
from ..nlp.stemmers import Stemmer
from . import precision, recall, f_score, cosine_similarity, unit_overlap
from . import rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level


PARSERS = {
    "html": HtmlParser,
    "plaintext": PlaintextParser,
}


def build_random(parser, language):
    return RandomSummarizer()


def build_luhn(parser, language):
    summarizer = LuhnSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    return summarizer


def build_edmundson(parser, language):
github miso-belica / sumy / sumy / __main__.py
from .utils import ItemsCount, get_stop_words, read_stop_words, fetch_url
from ._compat import to_string, to_unicode, to_bytes, PY3
from .nlp.tokenizers import Tokenizer
from .parsers.html import HtmlParser
from .parsers.plaintext import PlaintextParser
from .summarizers.luhn import LuhnSummarizer
from .summarizers.edmundson import EdmundsonSummarizer
from .summarizers.lsa import LsaSummarizer
from .summarizers.text_rank import TextRankSummarizer
from .summarizers.lex_rank import LexRankSummarizer
from .summarizers.sum_basic import SumBasicSummarizer
from .summarizers.kl import KLSummarizer
from .nlp.stemmers import Stemmer

PARSERS = {
    "html": HtmlParser,
    "plaintext": PlaintextParser,
}

AVAILABLE_METHODS = {
    "luhn": LuhnSummarizer,
    "edmundson": EdmundsonSummarizer,
    "lsa": LsaSummarizer,
    "text-rank": TextRankSummarizer,
    "lex-rank": LexRankSummarizer,
    "sum-basic": SumBasicSummarizer,
    "kl": KLSummarizer,
}


def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
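
The "html" entry in PARSERS above maps to HtmlParser, so the same parsing is also reachable from sumy's command line, for example when summarizing a page fetched with --url. A typical invocation, per the project's README, looks roughly like this (the URL and length are placeholders):

sumy lex-rank --length=10 --url=https://en.wikipedia.org/wiki/Automatic_summarization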