How to use the justext.core.classify_paragraphs function in jusText

To help you get started, we’ve selected a few jusText examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

GitHub: miso-belica / jusText / tests / test_classify_paragraphs.py (View on GitHub)
def test_stopwords_high(self):
    """Exercise ``classify_paragraphs`` with ``stopwords_high=0`` and ``length_high=20``.

    Two paragraphs are built from the same digit pattern: the first holds it
    once (19 characters), the second holds it twice (38 characters). With
    ``("0",)`` as the stop-word list, the first paragraph is expected to end
    up as "neargood" and the second as "good".
    """
    base_text = "0 1 2 3 4 5 6 7 8 9"
    paragraphs = [
        self._paragraph(text=base_text),
        self._paragraph(text=base_text * 2),
    ]

    # Keyword options are gathered in one place; they are passed unchanged.
    options = {
        "max_link_density": 1,
        "length_low": 0,
        "stopwords_high": 0,
        "length_high": 20,
    }
    classify_paragraphs(paragraphs, ("0",), **options)

    first, second = paragraphs[0], paragraphs[1]
    assert first.cf_class == "neargood"
    assert second.cf_class == "good"
GitHub: miso-belica / jusText / tests / test_classify_paragraphs.py (View on GitHub)
def test_length_low(self):
    """Exercise ``classify_paragraphs`` with ``length_low=1000``.

    Both paragraphs carry the same 38-character text, well under the
    ``length_low`` value; they differ only in ``chars_count_in_links``
    (0 versus 20). The paragraph without link characters is expected to be
    "short", the one with 20 link characters "bad".
    """
    repeated_text = "0 1 2 3 4 5 6 7 8 9" * 2
    paragraphs = [
        self._paragraph(text=repeated_text, chars_count_in_links=link_chars)
        for link_chars in (0, 20)
    ]

    classify_paragraphs(paragraphs, (), max_link_density=1, length_low=1000)

    without_links, with_links = paragraphs[0], paragraphs[1]
    assert without_links.cf_class == "short"
    assert with_links.cf_class == "bad"
GitHub: miso-belica / jusText / tests / test_classify_paragraphs.py (View on GitHub)
def test_stopwords_low(self):
    """Exercise ``classify_paragraphs`` with ``stopwords_low=0.2``.

    The stop-word list is ``("0", "1")`` and ``stopwords_high`` is set to
    1000. The three texts contain 5 of 13, 2 of 10 and 1 of 9
    whitespace-separated tokens from that list, respectively (presumably the
    ratio the classifier compares with ``stopwords_low`` -- confirm against
    the jusText implementation). The first two paragraphs are expected to be
    "neargood" and the last one "bad".
    """
    # Each entry pairs a paragraph text with the class expected for it.
    cases = (
        ("0 0 0 0 1 2 3 4 5 6 7 8 9", "neargood"),
        ("0 1 2 3 4 5 6 7 8 9", "neargood"),
        ("1 2 3 4 5 6 7 8 9", "bad"),
    )
    paragraphs = [self._paragraph(text=text) for text, _ in cases]

    classify_paragraphs(
        paragraphs,
        ("0", "1",),
        max_link_density=1,
        length_low=0,
        stopwords_high=1000,
        stopwords_low=0.2,
    )

    for index, (_, expected_class) in enumerate(cases):
        assert paragraphs[index].cf_class == expected_class
GitHub: miso-belica / jusText / tests / test_classify_paragraphs.py (View on GitHub)
def test_max_link_density(self):
    """Exercise ``classify_paragraphs`` with ``max_link_density=0.5``.

    Five paragraphs are built from a 10-character digit string repeated 2 or
    8 times (20 or 80 characters), each with a different
    ``chars_count_in_links`` value. Only the 20-character paragraph with no
    link characters is expected to be "short"; the other four are expected
    to be "bad".
    """
    chunk = "0123456789"
    # (number of repetitions of ``chunk``, chars_count_in_links)
    cases = [(2, 0), (2, 20), (8, 40), (8, 39), (8, 41)]
    paragraphs = [
        self._paragraph(text=chunk * repeats, chars_count_in_links=link_chars)
        for repeats, link_chars in cases
    ]

    classify_paragraphs(paragraphs, (), max_link_density=0.5)

    expected_classes = ["short", "bad", "bad", "bad", "bad"]
    for index, expected_class in enumerate(expected_classes):
        assert paragraphs[index].cf_class == expected_class