How to use the justext.core.ParagraphMaker class in jusText

To help you get started, we’ve selected a few jusText examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github miso-belica / jusText / tests / test_sax.py View on Github external
def test_links(self):
        """Inline text should be treated as separate paragraph."""
        html_string = (
            ''
            '<a>I am <strong>top</strong>-inline\n\n\n\n and I am happy \n</a>'
            '<p>normal text</p>'
            '<code>\nvar i = -INFINITY;\n</code>'
            '<div>after <a>text</a> with variable <var>N</var> </div>'
            '   I am inline\n\n\n\n and I am happy \n'
            ''
        )
        dom = html.fromstring(html_string)

        paragraphs = ParagraphMaker.make_paragraphs(dom)
        assert len(paragraphs) == 5

        self.assert_paragraphs_equal(
            paragraphs[0],
            words_count=7,
            tags_count=2,
            text="I am top-inline\nand I am happy",
            chars_count_in_links=31
        )
        self.assert_paragraphs_equal(
            paragraphs[1],
            words_count=2,
            tags_count=0,
            text="normal text"
        )
        self.assert_paragraphs_equal(
github miso-belica / jusText / tests / test_sax.py View on Github external
def test_no_paragraphs(self):
        """Empty markup must serialise back unchanged and produce zero paragraphs."""
        markup = ''
        tree = html.fromstring(markup)

        # Sanity check: the parsed tree serialises to exactly the input string.
        serialized = html.tostring(tree).decode("utf8")
        assert serialized == markup

        # With no content in the tree, the paragraph maker must return nothing.
        assert len(ParagraphMaker.make_paragraphs(tree)) == 0
github miso-belica / jusText / justext / core.py View on Github external
def justext(
    html_text,
    stoplist,
    length_low=LENGTH_LOW_DEFAULT,
    length_high=LENGTH_HIGH_DEFAULT,
    stopwords_low=STOPWORDS_LOW_DEFAULT,
    stopwords_high=STOPWORDS_HIGH_DEFAULT,
    max_link_density=MAX_LINK_DENSITY_DEFAULT,
    max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT,
    no_headings=NO_HEADINGS_DEFAULT,
    encoding=None,
    default_encoding=DEFAULT_ENCODING,
    enc_errors=DEFAULT_ENC_ERRORS,
    preprocessor=preprocessor,
):
    """
    Turn an HTML page into a list of classified paragraphs.

    Each returned paragraph is an instance of
    ``justext.paragraph.Paragraph``.

    The pipeline is: parse ``html_text`` with ``html_to_dom`` (using
    ``default_encoding``, ``encoding`` and ``enc_errors``), run the
    ``preprocessor`` callable over the resulting DOM, split it into
    paragraphs, classify them (``stoplist``, ``length_low``, ``length_high``,
    ``stopwords_low``, ``stopwords_high``, ``max_link_density`` and
    ``no_headings`` are forwarded to ``classify_paragraphs``), and finally
    revise the classification using ``max_heading_distance``.
    """
    # Parse first, then let the preprocessor return the DOM to work on.
    parsed_dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
    cleaned_dom = preprocessor(parsed_dom)

    result = ParagraphMaker.make_paragraphs(cleaned_dom)

    # The two classification passes return nothing that is used here; the
    # same ``result`` list is what gets returned.
    classify_paragraphs(
        result,
        stoplist,
        length_low,
        length_high,
        stopwords_low,
        stopwords_high,
        max_link_density,
        no_headings,
    )
    revise_paragraph_classification(result, max_heading_distance)

    return result