How to use the justext.core.JustextError exception class in jusText

To help you get started, we’ve selected a few jusText examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github miso-belica / jusText / tests / test_html_encoding.py View on Github external
def test_fake_encoding_in_meta(self):
        """Strict decoding is expected to raise when <meta> declares a bogus charset."""
        # The meta tag names a charset that does not exist, while the payload
        # itself is encoded as ISO-8859-2; with errors='strict' the test
        # expects JustextError rather than a silently replaced result.
        html = '<meta http-equiv="Content-Type" content="text/html; charset=iso-fake-29" /> ľščťžäňôě'

        with pytest.raises(JustextError):
            decode_html(html.encode("iso-8859-2"), errors='strict')
github miso-belica / jusText / tests / test_html_encoding.py View on Github external
def test_unknown_encoding_in_strict_mode(self):
        """Expect JustextError for ISO-8859-2 bytes with no charset hint in strict mode."""
        # Prepare the raw payload up front so only the decode call sits
        # inside the raises-context.
        raw_bytes = 'ľščťžäňôě'.encode("iso-8859-2")
        with pytest.raises(JustextError):
            decode_html(raw_bytes, errors='strict')
github pschwede / AnchorBot / bot.py View on Github external
def get_article(entry):
    page = ""
    content = ""
    picture = ""
    media = ""

    page = get_html(entry.link)
    language = guess_language(page)
    try:
        content = remove_boilerplate(page, language=language)
    except justext.core.JustextError:
        pass
    try:
        picture = find_picture(page)
    except requests.exceptions.Timeout:
        pass

    media = find_media(page)

    keywords = find_keywords(entry.title)
    article = {"link": entry.link,
               "title": entry.title,
               "release": time(),
               "content": content,
               "media": media,
               "image": picture,
               "keywords": keywords,
github miso-belica / jusText / justext / core.py View on Github external
# HTML tag names treated as paragraph-level elements (the code consuming this
# list is outside this excerpt).
PARAGRAPH_TAGS = [
    'body', 'blockquote', 'caption', 'center', 'col', 'colgroup', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'form', 'legend', 'optgroup', 'option',
    'p', 'pre', 'table', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr',
    'ul', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
]
# Fallback values used as the defaults of the `default_encoding` and `errors`
# parameters of html_to_dom().
DEFAULT_ENCODING = 'utf8'
DEFAULT_ENC_ERRORS = 'replace'
# Bytes pattern that captures (group 1) the charset declared inside a <meta>
# tag, e.g. `<meta charset="utf-8">` or
# `<meta http-equiv="Content-Type" content="text/html; charset=utf-8">`.
# The optional quote before the group handles both quoted and bare values;
# matching is case-insensitive.
CHARSET_META_TAG_PATTERN = re.compile(br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)


class JustextError(Exception):
    """Root exception type from which jusText-specific errors derive."""


class JustextInvalidOptions(JustextError):
    """JustextError subtype for invalid options (raise sites are outside this excerpt)."""


def html_to_dom(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """Converts HTML to DOM."""
    if isinstance(html, unicode):
        decoded_html = html
        # encode HTML for case it's XML with encoding declaration
        forced_encoding = encoding if encoding else default_encoding
        html = html.encode(forced_encoding, errors)
    else:
        decoded_html = decode_html(html, default_encoding, encoding, errors)

    try:
        dom = lxml.html.fromstring(decoded_html, parser=lxml.html.HTMLParser())
    except ValueError: