"""
We compute similarity of the same sentences. These should be exactly the same and
therefore have similarity close to 1.0.
see https://github.com/miso-belica/sumy/issues/58
"""
from sumy.summarizers.lex_rank import LexRankSummarizer

sentence1 = ["this", "sentence", "is", "simple", "sentence"]
tf1 = {"this": 1/2, "sentence": 1.0, "is": 1/2, "simple": 1/2}
sentence2 = ["this", "sentence", "is", "simple", "sentence"]
tf2 = {"this": 1/2, "sentence": 1.0, "is": 1/2, "simple": 1/2}
idf = {
    "this": 2/2,
    "sentence": 2/2,
    "is": 2/2,
    "simple": 2/2,
}
summarizer = LexRankSummarizer()
cosine = summarizer.cosine_similarity(sentence1, sentence2, tf1, tf2, idf)
assert abs(1.0 - cosine) < 0.00001
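
# The expected value follows from the idf-modified cosine measure LexRank uses
# (Erkan & Radev, 2004). A minimal standalone sketch of that formula, as an
# illustration only -- this is not sumy's internal implementation:
import math

def tfidf_cosine(words1, words2, tf1, tf2, idf):
    # Numerator: tf-idf products summed over the words the sentences share.
    common = set(words1) & set(words2)
    numerator = sum(tf1[w] * tf2[w] * idf[w] ** 2 for w in common)
    # Denominator: Euclidean norms of the two tf-idf vectors.
    norm1 = math.sqrt(sum((tf1[w] * idf[w]) ** 2 for w in set(words1)))
    norm2 = math.sqrt(sum((tf2[w] * idf[w]) ** 2 for w in set(words2)))
    return numerator / (norm1 * norm2)

# For the identical sentences above this evaluates to exactly 1.0.
assert abs(tfidf_cosine(sentence1, sentence2, tf1, tf2, idf) - 1.0) < 0.00001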
def test_tf_metrics():
    summarizer = LexRankSummarizer()
    sentences = [
        ("this", "sentence", "is", "simple", "sentence"),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too"),
    ]

    metrics = summarizer._compute_tf(sentences)

    expected = [
        {"this": 1/2, "is": 1/2, "simple": 1/2, "sentence": 1.0},
        {"this": 1/3, "is": 2/3, "yes": 1/3, "simple": 1/3, "sentence": 1/3, "too": 1.0},
    ]
    assert expected == metrics
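
# The expected values above are consistent with max-normalized term frequency:
# each word's count divided by the count of the most frequent word in the
# sentence. A standalone sketch of that computation (not sumy's internal code):
from collections import Counter

def max_normalized_tf(sentence):
    counts = Counter(sentence)
    max_count = max(counts.values())
    return {word: count / max_count for word, count in counts.items()}

# ("this", "sentence", "is", "simple", "sentence") -> {"this": 0.5, "sentence": 1.0, ...}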
def auto_summarize_comment(request):
    import re

    from sumy.nlp.stemmers import Stemmer
    #from sumy.utils import get_stop_words
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    #from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    #from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer

    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)

    comment_ids = request.POST.getlist('d_ids[]')
    sent_list = []
    for comment_id in comment_ids:
        # Comment is the Django model this view summarizes.
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        # Replace <br> tags with spaces before HTML parsing.
        text = re.sub('<br>', ' ', text)
        parser = HtmlParser.from_string(text, '', Tokenizer("english"))

    num_sents = request.GET.get('num_sents', None)
    if not num_sents:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def sumy_summarizer(docx):
    # Parse raw text, rank sentences with LexRank, and return a 3-sentence summary.
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summarizer = LexRankSummarizer()
    summary = lex_summarizer(parser.document, 3)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
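
# Example call for the helper above (the input text is illustrative only):
sample_text = (
    "LexRank is a graph-based summarization algorithm. "
    "It scores sentences by their centrality in a sentence-similarity graph. "
    "The highest-ranked sentences are returned as the summary. "
    "This paragraph exists only to exercise the function."
)
print(sumy_summarizer(sample_text))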
def scrape(self, url):
    complete_url = url
    try:
        # get summary
        print("Retrieving page summary of %s... " % url)
        parser = HtmlParser.from_url(complete_url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        # Join sentences with spaces so the summary reads as normal text.
        url_summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))
    except Exception as e:
        url_summary = "Could not scrape summary. Reason: %s" % e
    print("Done: %s = %s" % (url, url_summary))

    # create scraping result
    scraping_result = ScrapingResult()
    scraping_result.summary = url_summary
    scraping_result.url = url
    return scraping_result
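
# scrape() assumes module-level configuration plus the sumy imports shown in
# the other snippets on this page; the values below are only an example:
LANGUAGE = "english"
SENTENCES_COUNT = 3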
def summarize_from_file(self, file_name):
    parser = PlaintextParser.from_file(file_name, Tokenizer(self.LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)
    summarizer = Summarizer(stemmer)
    # Write the summary twice: one sentence per line, and as a single unbroken block.
    with open("summarizer_output.txt", "w+") as file_1, \
         open("summarizer_output2.txt", "w+") as file_2:
        for sentence in summarizer(parser.document, self.SENTENCES_COUNT):
            file_2.write(str(sentence))
            file_1.write(str(sentence))
            file_1.write("\n")
def get_summary_lex_rank(self, num_sentence):
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser  # other parsers available for HTML etc.
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer  # We're choosing LexRank; other algorithms are also built in

    # Prefer fetching and parsing the URL; fall back to the stored body text.
    try:
        parser = HtmlParser.from_url(self.url, Tokenizer("english"))
    except Exception:
        parser = PlaintextParser.from_string(self.body, Tokenizer("english"))

    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, num_sentence)
    out = ' '.join(str(sentence) for sentence in summary)
    return out
from docopt import docopt

from . import __version__
from ._compat import to_string, to_unicode, to_bytes, PY3
from .parsers.html import HtmlParser
from .parsers.plaintext import PlaintextParser
from .summarizers.luhn import LuhnSummarizer
from .summarizers.edmundson import EdmundsonSummarizer
from .summarizers.lsa import LsaSummarizer
from .summarizers.text_rank import TextRankSummarizer
from .summarizers.lex_rank import LexRankSummarizer
from .summarizers.sum_basic import SumBasicSummarizer
from .summarizers.kl import KLSummarizer
from .nlp.stemmers import Stemmer

PARSERS = {
    "html": HtmlParser,
    "plaintext": PlaintextParser,
}

AVAILABLE_METHODS = {
    "luhn": LuhnSummarizer,
    "edmundson": EdmundsonSummarizer,
    "lsa": LsaSummarizer,
    "text-rank": TextRankSummarizer,
    "lex-rank": LexRankSummarizer,
    "sum-basic": SumBasicSummarizer,
    "kl": KLSummarizer,
}
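
# With these registries in place, classes can be resolved from command-line
# strings. A hedged sketch of such a lookup (resolve_method is hypothetical,
# not sumy's actual argument handling):
def resolve_method(name, language="english"):
    if name not in AVAILABLE_METHODS:
        raise ValueError("Unknown summarization method: %s" % name)
    return AVAILABLE_METHODS[name](Stemmer(language))

summarizer = resolve_method("lex-rank")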
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, parser, items_count = handle_arguments(args)

    for sentence in summarizer(parser.document, items_count):
        if PY3:
            print(to_unicode(sentence))
        else:
            print(to_bytes(sentence))

    return 0
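
# The module would typically finish with the usual entry-point guard; a
# minimal sketch (sumy's actual __main__ may differ):
if __name__ == "__main__":
    main()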
import os

# Name and signature inferred from usage; the original snippet is a fragment
# of a ROUGE-logging helper.
def rouge_log(results_dict, dir_to_write):
    log_str = ""
    for x in ["1", "2", "su4"]:
        for y in ["precision", "recall", "f_score"]:
            key = "rouge_%s_%s" % (x, y)
            val = results_dict[key]
            log_str += "%.4f\t" % (val)
        log_str += "\n"
    print(log_str)
    results_file = os.path.join(dir_to_write, "ROUGE_results.txt")
    print("Writing final ROUGE results to %s..." % results_file)
    with open(results_file, "w") as f:
        f.write(log_str)
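
# The keys consumed above follow pyrouge-style naming (rouge_1_precision,
# rouge_su4_f_score, ...). An explicitly-placeholder dict to exercise the
# helper above:
dummy_results = {
    "rouge_%s_%s" % (x, y): 0.0
    for x in ["1", "2", "su4"]
    for y in ["precision", "recall", "f_score"]
}
rouge_log(dummy_results, ".")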
for summary_method in summary_methods:
    print('Summarizing using the method: ' + summary_method)
    if summary_method == 'lexrank':
        summary_fn = LexRankSummarizer
    elif summary_method == 'kl':
        summary_fn = KLSummarizer
    elif summary_method == 'sumbasic':
        summary_fn = SumBasicSummarizer
    else:
        raise Exception('Could not find summary method ' + summary_method)

    # Create the reference and decoded output folders for this method.
    if not os.path.exists(os.path.join(out_dir, summary_method, reference_folder)):
        os.makedirs(os.path.join(out_dir, summary_method, reference_folder))
    if not os.path.exists(os.path.join(out_dir, summary_method, decoded_folder)):
        os.makedirs(os.path.join(out_dir, summary_method, decoded_folder))
    print(os.path.join(out_dir, summary_method))

    article_names = sorted(os.listdir(articles_dir))
    for art_idx, article_name in enumerate(tqdm(article_names)):
        file = os.path.join(articles_dir, article_name)
        parser = HtmlParser.from_file(file, "", Tokenizer("english"))