How to use the justext.core.ParagraphMaker.make_paragraphs function in jusText

To help you get started, we’ve selected a few jusText examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: miso-belica / jusText / tests / test_sax.py (view on GitHub)
def test_multiple_line_break(self):
        """Two consecutive ``<br>`` tags split inline text into two paragraphs."""
        # A full-document wrapper is needed for the round-trip check below:
        # assuming ``html`` is ``lxml.html``, ``fromstring`` wraps a bare
        # fragment in a synthetic parent element, so the serialised output
        # would no longer equal the input.
        html_string = (
            '<html><body>'
            '  normal text   <br><br> another   text  '
            '</body></html>'
        )
        dom = html.fromstring(html_string)

        # Sanity check: parsing and serialising must leave the markup unchanged.
        returned = html.tostring(dom).decode("utf8")
        assert html_string == returned

        paragraphs = ParagraphMaker.make_paragraphs(dom)
        assert len(paragraphs) == 2

        # Leading, trailing and repeated whitespace is expected to be
        # normalised away in the paragraph text.
        self.assert_paragraphs_equal(
            paragraphs[0],
            text="normal text",
            words_count=2,
            tags_count=0
        )
        self.assert_paragraphs_equal(
            paragraphs[1],
            text="another text",
            words_count=2,
            tags_count=0
        )
Source: miso-belica / jusText / tests / test_sax.py (view on GitHub)
def test_whitespace_handling(self):
        html_string = (
            ''
            '<p>pre<em>in</em>post \t pre  <span class="class"> in </span>  post</p>'
            '<div>pre<em> in </em>post</div>'
            '<pre>pre<em>in </em>post</pre>'
            '<blockquote>pre<em> in</em>post</blockquote>'
            ''
        )
        dom = html.fromstring(html_string)

        returned = html.tostring(dom).decode("utf8")
        assert html_string == returned

        paragraphs = ParagraphMaker.make_paragraphs(dom)
        assert len(paragraphs) == 4

        self.assert_paragraphs_equal(
            paragraphs[0],
            text="preinpost pre in post",
            words_count=4,
            tags_count=2
        )
        self.assert_paragraphs_equal(
            paragraphs[1],
            text="pre in post",
            words_count=3,
            tags_count=1
        )
        self.assert_paragraphs_equal(
            paragraphs[2],
Source: miso-belica / jusText / tests / test_sax.py (view on GitHub)
def test_inline_text_in_body(self):
        """Inline text should be treated as separate paragraph."""
        html_string = (
            ''
            '<sup>I am <strong>top</strong>-inline\n\n\n\n and I am happy \n</sup>'
            '<p>normal text</p>'
            '<code>\nvar i = -INFINITY;\n</code>'
            '<div>after text with variable <var>N</var> </div>'
            '   I am inline\n\n\n\n and I am happy \n'
            ''
        )
        dom = html.fromstring(html_string)

        paragraphs = ParagraphMaker.make_paragraphs(dom)
        assert len(paragraphs) == 5

        self.assert_paragraphs_equal(
            paragraphs[0],
            words_count=7,
            tags_count=2,
            text="I am top-inline\nand I am happy"
        )
        self.assert_paragraphs_equal(
            paragraphs[1],
            words_count=2,
            tags_count=0,
            text="normal text"
        )
        self.assert_paragraphs_equal(
            paragraphs[2],
Source: miso-belica / jusText / tests / test_sax.py (view on GitHub)
def test_basic(self):
        """A heading and two ``<p>`` elements yield three separate paragraphs."""
        # A full-document wrapper is needed for the round-trip check below:
        # assuming ``html`` is ``lxml.html``, ``fromstring`` wraps a bare
        # multi-element fragment in a synthetic parent element, so the
        # serialised output would no longer equal the input.
        html_string = (
            '<html><body>'
            '<h1>Header</h1>'
            '<p>text and some <em>other</em> words <span class="class">that I</span> have in my head now</p>'
            '<p>footer</p>'
            '</body></html>'
        )
        dom = html.fromstring(html_string)

        # Sanity check: parsing and serialising must leave the markup unchanged.
        returned = html.tostring(dom).decode("utf8")
        assert html_string == returned

        paragraphs = ParagraphMaker.make_paragraphs(dom)
        assert len(paragraphs) == 3

        self.assert_paragraphs_equal(paragraphs[0], text="Header", words_count=1, tags_count=0)

        # The inline <em> and <span> elements are counted as tags (2) while
        # their text stays part of the surrounding paragraph.
        text = "text and some other words that I have in my head now"
        self.assert_paragraphs_equal(paragraphs[1], text=text, words_count=12, tags_count=2)

        self.assert_paragraphs_equal(paragraphs[2], text="footer", words_count=1, tags_count=0)