How to use the justext.justext function in jusText

To help you get started, we’ve selected a few justext.justext examples, based on how it is commonly used in public projects.
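The pattern is the same in every example below: parse the HTML with justext.justext() together with a stoplist from justext.get_stoplist(), then keep the paragraphs that are not classified as boilerplate. A minimal sketch of that pattern (the URL is a placeholder, and fetching the page with requests is an assumption made for illustration, not part of jusText itself):

import requests
import justext

response = requests.get("https://example.com")  # placeholder URL
paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
for paragraph in paragraphs:
    if not paragraph.is_boilerplate:  # skip navigation, ads and other boilerplate
        print(paragraph.text)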


github microsoft/macaw: macaw/util/text_parser.py
def html_to_clean_text(html):
    """
    Converting an HTML document to clean text.
    Args:
        html(str): The content of an HTML web page.

    Returns:
        A str containing the clean content of the web page.
    """
    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    clean_text_list = []
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            clean_text_list.append(paragraph.text)
    return '\n'.join(clean_text_list)
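
A quick way to exercise this helper (the example.com URL and the requests dependency are illustrative assumptions, not part of the macaw code):

import requests

html = requests.get("https://example.com").text
print(html_to_clean_text(html))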

github tomazk/Text-Extraction-Evaluation: src/txtexeval/extractor.py
    def extract(self):
        html = self.data_instance.get_raw_html()
        html = html.encode(self.data_instance.raw_encoding,'ignore')
        paragraphs = justext.justext(html, justext.get_stoplist('English'),
                             encoding = self.data_instance.raw_encoding)    
        good_paragraphs = []
        for para in paragraphs:
            if para['class'] == 'good':
                paragraph_text = para['text']
                # this assertion makes sure we catch string and unicode only
                assert isinstance(paragraph_text, basestring)
                if type(paragraph_text) == unicode:
                    good_paragraphs.append(paragraph_text.encode('utf8', 'ignore'))
                else:
                    good_paragraphs.append(paragraph_text)
            
        return '\n\n'.join(good_paragraphs)
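
This excerpt is written for Python 2 (basestring, unicode) and an early jusText release in which each paragraph was a dictionary. In current releases, justext.justext returns Paragraph objects, so the equivalent checks would be paragraph.class_type == 'good' (or simply not paragraph.is_boilerplate) and paragraph.text.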

github kootenpv/sky: sky/scraper.py
    def get_content(self, html):
        # I should refactor the other get_content when this fails into here
        lang_mapping = {'nl': 'Dutch', 'en': 'English', 'com': 'English'}
        if self.detected_language not in lang_mapping:
            return ''
        lang = lang_mapping[self.detected_language]
        body_content = [x.text for x in justext.justext(html, justext.get_stoplist(lang))
                        if not x.is_boilerplate and not x.is_heading]
        return body_content

github chiphuyen/lazynlp: lazynlp/cleaner.py
def parse_html(page):
    """ Clean HTML tags for webpages that aren't Gutenberg books
    """
    try:
        parts = justext.justext(page, justext.get_stoplist('English'))
    except lxml.etree.ParserError as e:
        print('Page empty')
        return ''
    except UnicodeDecodeError as e:
        print("Can't decode utf-8")
        return ''
    paragraphs = []
    for part in parts:
        if not part.is_boilerplate:
            paragraphs.append(part.text)
    return '\n\n'.join(paragraphs)
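
Note that the except clause references lxml.etree, so this excerpt needs lxml imported alongside justext to run on its own. jusText parses HTML with lxml under the hood, which is why an empty page surfaces here as an lxml.etree.ParserError.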

github pschwede/AnchorBot: bot.py
def remove_boilerplate(html, language="English"):
    try:
        paragraphs = justext.justext(html, justext.get_stoplist(language))
    except:
        return html  # TODO alternative to justext
    tag = lambda p: ("%s\n----\n" if p.is_heading else "%s\n\n") % p.text
    content = "".join([tag(p) for p in paragraphs if not p.is_boilerplate])
    return content
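
The bare except means that any jusText failure, for example a language name without a bundled stoplist, silently falls back to returning the raw HTML, which keeps the bot running at the cost of leaving boilerplate in place. Headings that survive the filter are set off with a line of dashes; all other paragraphs are separated by blank lines.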

github microsoft/macaw: macaw/core/retrieval/doc.py
    Returns:
        An instance of Document. Note that the score is assigned to 0 and should be set later.
    """
    trec_doc_lower = trec_doc.lower()
    id = trec_doc[trec_doc_lower.find('<docno>') + len('<docno>'):trec_doc_lower.find('</docno>')].strip()
    title = id  # for some presentation reasons, the title of the document is set to its ID.
    if format == 'trectext':
        text = trec_doc[trec_doc_lower.find('<text>') + len('<text>'):trec_doc_lower.find('</text>')].strip()

github andreypopp/extracty: extracty/content.py
def remove_bad_by_classifier(doc):
    ps = justext.justext(
        doc, justext.get_stoplist('English'))
    to_delete = []
    good = []
    for p in ps:
        if p['class'] == 'bad':
            for el in doc.xpath(p['xpath']):
                to_delete.append((el, p['xpath']))
        elif p['class'] == 'good':
            good.append(p['xpath'])

    for el, xp in reversed(to_delete):
        if el.getparent() is not None and not any(xp in g for g in good):
            el.drop_tree()
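
This example uses jusText as a classifier over an already parsed lxml document rather than as a text extractor: paragraphs rated 'bad' are dropped from the tree with drop_tree(), unless their xpath also occurs within some 'good' paragraph's xpath, so that containers of good content are preserved.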

github andreypopp/extracty: extracty/image.py
    def _find_heueristics(doc):
        ps = paragraphs or justext.justext(
            doc, justext.get_stoplist('English'))
        prev = None
        images = []
        for p in ps:
            if p['class'] == 'good':
                xpath = p['xpath']
                e = doc.xpath(xpath)
                if not e:
                    continue
                e = e[0]
                for prec in utils.precedings(e,
                        before=lambda x: prev is not None and prev is e):
                    if prec.tag == 'img' and prec.attrib.get('src'):
                        images.append(prec.attrib['src'])
                prev = e
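
Here the 'good' paragraphs act only as anchors: each xpath is resolved back to its DOM element, and the src attributes of img elements among its preceding elements (via the package's utils.precedings helper) are collected, on the assumption that a lead image sits just before the article body.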

github 9b/chirp: app/tasks/__init__.py
    if processed and not reprocess:
        # logger.debug("Skipping %s", article['uuid'])
        return {'article': processed, 'from_store': True}
    article['title'] = item.get('title', None)
    href = item.get('link', None)
    article['href'] = strip_google(href)
    article['source'] = derive_source(article['href'])
    article['collected'] = now_time()
    article['published'] = item.get('published', None)
    article['summary'] = item.get('summary', None)

    page_content = get_page_content(article['href'])
    if not page_content:
        logger.debug("No content found: %s" % article['href'])
        return {'article': None, 'from_store': True}
    paragraphs = justext.justext(page_content,
                                 justext.get_stoplist("English"),
                                 no_headings=True,
                                 max_heading_distance=150,
                                 length_high=140,
                                 max_link_density=0.4,
                                 stopwords_low=0.2,
                                 stopwords_high=0.3)
    text_content = list()
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        text_content.append(paragraph.text)
    text_content = '\n'.join(text_content)
    tokens = get_tokens(text_content)

    article['word_count'] = len(tokens)
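
The keyword arguments in this call override jusText's classification thresholds rather than relying on the defaults: no_headings=True discards headings, max_heading_distance limits how far a heading may sit from the content it introduces, and length_high, max_link_density, stopwords_low, and stopwords_high tune how paragraph length, link density, and stopword density are traded off when deciding what counts as boilerplate. These values appear chosen to accept shorter and more link-heavy paragraphs than jusText would by default.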