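The snippets below are pytest-style tests exercising the sumy summarization library. They omit the imports and helpers of their original test modules; a likely set, based on sumy's public module layout, is sketched here (treat the exact paths as assumptions):

import pytest

from sumy.models import TfDocumentModel
from sumy.models.dom import Sentence
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.stemmers.czech import stem_word  # Czech stemming function used below (assumed path)
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.utils import get_stop_words

# build_document and load_resource are helpers defined in sumy's own test
# utilities, not in the installed package.
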
def test_normalized_words_frequencies():
    words = "a b c d e c b d c e e d e d e".split()
    model = TfDocumentModel(tuple(words))

    # Frequencies are normalized by the count of the most frequent term
    # ("e" occurs 5 times), so "e" maps to 1.0 and unseen terms to 0.0.
    assert model.normalized_term_frequency("a") == pytest.approx(1/5)
    assert model.normalized_term_frequency("b") == pytest.approx(2/5)
    assert model.normalized_term_frequency("c") == pytest.approx(3/5)
    assert model.normalized_term_frequency("d") == pytest.approx(4/5)
    assert model.normalized_term_frequency("e") == pytest.approx(5/5)
    assert model.normalized_term_frequency("z") == pytest.approx(0.0)

    assert model.most_frequent_terms() == ("e", "d", "c", "b", "a")

def test_single_sentence(summarizer):
    # "summarizer" is supplied by a pytest fixture not shown in this excerpt.
    s = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([s])

    returned = summarizer(document, 10)

    assert len(returned) == 1

def test_most_frequent_terms_empty():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)

    assert model.most_frequent_terms() == ()
    assert model.most_frequent_terms(10) == ()

def test_slovak_alias_into_czech_tokenizer():
    # "slovak" is aliased to the Czech tokenizer internally, but the
    # requested language name is preserved on the tokenizer.
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences

def test_terms():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wD wB wD wE"

    model = TfDocumentModel(text, tokenizer)

    # Terms come back lower-cased and deduplicated.
    terms = tuple(sorted(model.terms))
    assert terms == ("wa", "wb", "wc", "wd", "we")

def test_empty_document():
    document = build_document()
    summarizer = TextRankSummarizer(Stemmer("english"))

    returned = summarizer(document, 10)

    assert len(returned) == 0

def test_empty_document():
    document = build_document()
    summarizer = ReductionSummarizer(Stemmer("english"))

    returned = summarizer(document, 10)

    assert len(returned) == 0

def test_real_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 2)

    assert len(sentences) == 2

def test_parse_plaintext_long(self):
    parser = PlaintextParser.from_string("""
        Ako sa máš? Ja dobre! A ty? No
        mohlo to byť aj lepšie!!! Ale pohodička.
        TOTO JE AKOŽE NADPIS
        A toto je text pod ním, ktorý je textový.
        A tak ďalej...
        VEĽKOLEPÉ PREKVAPENIE
        Tretí odstavec v tomto texte je úplne o ničom. Ale má
        vety a to je hlavné. Takže sa majte na pozore ;-)
        A tak ďalej...
        A tak este dalej!
    """, Tokenizer("czech"))

def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)

    assert len(sentences) == 20
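
Taken together, the tests exercise the same parse-stem-summarize pipeline the library exposes to callers. A minimal end-to-end sketch, not taken from the test suite, assuming the import paths listed above and NLTK's punkt data for the English tokenizer:

# Parse a plain-text string, configure an LSA summarizer with stemming and
# stop words, and print a 2-sentence summary.
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

text = (
    "Sumy builds a document model from plain text. "
    "A summarizer then scores each sentence of that model. "
    "The highest-scoring sentences form the summary."
)

parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = LsaSummarizer(Stemmer("english"))
summarizer.stop_words = get_stop_words("english")

for sentence in summarizer(parser.document, 2):
    print(sentence)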