How to use the justext.get_stoplist function in jusText

To help you get started, we've selected a few jusText examples based on popular ways the library is used in public projects.

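At its core, justext.get_stoplist(language) returns the stopword set that justext.justext uses to classify paragraphs as content or boilerplate. Here is a minimal sketch following the library's documented usage (the URL is a placeholder):

import requests
import justext

response = requests.get("https://example.com")  # placeholder URL
paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
for paragraph in paragraphs:
    if not paragraph.is_boilerplate:
        print(paragraph.text)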

github pschwede / AnchorBot / bot.py
import justext

def guess_language(html):
    hits = dict()
    # naive tokenisation: split the raw markup on spaces
    htmlset = set(str(html).split(" "))
    for lang in justext.get_stoplists():
        # score each language by how many of its stopwords occur in the page
        hits[lang] = len(set(justext.get_stoplist(lang)).intersection(htmlset))
    # the language whose stoplist overlaps the page most wins
    return max(hits, key=hits.get)
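A quick usage sketch (the HTML literal is made up; with this many German stopwords, "German" should score highest):

html = "<p>Dies ist nur ein kleiner Test, aber er hat viele deutsche Stoppwörter.</p>"
print(guess_language(html))  # expected: "German", the language with the most stopword hits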
github 9b / chirp / app / tasks / __init__.py
        # logger.debug("Skipping %s", article['uuid'])
        return {'article': processed, 'from_store': True}
    article['title'] = item.get('title', None)
    href = item.get('link', None)
    article['href'] = strip_google(href)
    article['source'] = derive_source(article['href'])
    article['collected'] = now_time()
    article['published'] = item.get('published', None)
    article['summary'] = item.get('summary', None)

    page_content = get_page_content(article['href'])
    if not page_content:
        logger.debug("No content found: %s" % article['href'])
        return {'article': None, 'from_store': True}
    paragraphs = justext.justext(page_content,
                                 justext.get_stoplist("English"),
                                 no_headings=True,
                                 max_heading_distance=150,
                                 length_high=140,
                                 max_link_density=0.4,
                                 stopwords_low=0.2,
                                 stopwords_high=0.3)
    text_content = list()
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        text_content.append(paragraph.text)
    text_content = '\n'.join(text_content)
    tokens = get_tokens(text_content)

    article['word_count'] = len(tokens)
    article['read_time'] = round(float(article['word_count'])/250, 2)
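For context, the keyword arguments above tighten jusText's classification thresholds: length_high is the character length a block needs to be rated "good" outright, stopwords_low and stopwords_high are the stopword-density cut-offs, max_link_density rejects link-heavy blocks, and max_heading_distance controls how far a heading may sit from the next good paragraph. Below is a self-contained sketch of the same pipeline; the URL is a placeholder, a plain requests call stands in for the project's get_page_content helper, and str.split stands in for its get_tokens:

import requests
import justext

page = requests.get("https://example.com/article", timeout=10)
paragraphs = justext.justext(page.content,
                             justext.get_stoplist("English"),
                             no_headings=True,
                             max_heading_distance=150,
                             length_high=140,
                             max_link_density=0.4,
                             stopwords_low=0.2,
                             stopwords_high=0.3)
# keep only the paragraphs jusText does not classify as boilerplate
text_content = '\n'.join(p.text for p in paragraphs if not p.is_boilerplate)
word_count = len(text_content.split())
read_time = round(word_count / 250, 2)  # minutes at 250 words per minute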
github comperiosearch / elasticsearch-eslib / eslib / web.py
    NOTE: quality depends on correct language detection.

    :param page_str: str HTML page source.
    :param lang: str Google Translate language code.
    :param relaxed: boolean If True the span between the first and last good/near-good boilerplate match
        is returned. Short and bad segments in between are kept.
    :return: list List of non-boilerplate segments/paragraphs.
    """
    if lang not in GTRANS_JUSTEXT_LANG_MAP:
        #raise AttributeError("Can not remove boilerplate for language code lang='%s'." % lang)
        return []

    jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang]

    paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang))

    if relaxed:
        good_indexes = [i for i, p in enumerate(paragraphs)
                        if p.class_type in ('near-good', 'good')]

        if len(good_indexes) == 0:
            return []

        return [paragraph.text for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]]
    else:
        return [paragraph.text for paragraph in paragraphs if paragraph.class_type in ['near-good', 'good', 'short']]
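Each jusText paragraph carries a class_type of 'good', 'near-good', 'short', or 'bad', and is_boilerplate is shorthand for "anything but 'good'", which is why the relaxed branch above filters on class_type directly. A small sketch for inspecting the classification (the HTML literal is a stand-in):

from collections import Counter
import justext

html = b"<html><body><h1>Title</h1><p>Some article text for demonstration.</p></body></html>"
paragraphs = justext.justext(html, justext.get_stoplist("English"))
# tally how jusText classified each block on this page
print(Counter(p.class_type for p in paragraphs))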
github tomazk / Text-Extraction-Evaluation / src / txtexeval / extractor.py
def extract(self):
        html = self.data_instance.get_raw_html()
        html = html.encode(self.data_instance.raw_encoding, 'ignore')
        paragraphs = justext.justext(html, justext.get_stoplist('English'),
                                     encoding=self.data_instance.raw_encoding)
        good_paragraphs = []
        for para in paragraphs:
            if para['class'] == 'good':
                paragraph_text = para['text']
                # this assertion makes sure we catch string and unicode only
                assert isinstance(paragraph_text, basestring)
                if type(paragraph_text) == unicode:
                    good_paragraphs.append(paragraph_text.encode('utf8', 'ignore'))
                else:
                    good_paragraphs.append(paragraph_text)
            
        return '\n\n'.join(good_paragraphs)
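This last extractor targets Python 2 (note basestring and unicode) and the old dict-style paragraph API. A hedged sketch of the same logic on Python 3 with the current jusText object API; the class name is hypothetical and the data_instance interface is assumed from the project above:

import justext

class JustextExtractor:  # hypothetical name
    def __init__(self, data_instance):
        self.data_instance = data_instance  # assumed project interface

    def extract(self):
        html = self.data_instance.get_raw_html()
        paragraphs = justext.justext(html.encode('utf8', 'ignore'),
                                     justext.get_stoplist('English'))
        # keep only 'good' paragraphs; str is already unicode on Python 3,
        # so no basestring/unicode juggling is needed
        return '\n\n'.join(p.text for p in paragraphs
                           if p.class_type == 'good')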