import justext

def html_to_clean_text(html):
    """
    Convert an HTML document to clean text.

    Args:
        html (str): The content of an HTML web page.

    Returns:
        A str containing the clean content of the web page.
    """
    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    # Keep only paragraphs that jusText did not classify as boilerplate.
    clean_text_list = []
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            clean_text_list.append(paragraph.text)
    return '\n'.join(clean_text_list)
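Called end to end, the function above strips navigation, ads, and other boilerplate from a fetched page. A minimal usage sketch, assuming the requests library and a placeholder URL:

import requests

response = requests.get('https://example.com/some-article')  # placeholder URL
print(html_to_clean_text(response.text))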
def extract(self):
    # Note: this snippet targets Python 2 and the legacy jusText API, in
    # which justext() returns dicts with 'class' and 'text' keys.
    html = self.data_instance.get_raw_html()
    html = html.encode(self.data_instance.raw_encoding, 'ignore')
    paragraphs = justext.justext(html, justext.get_stoplist('English'),
                                 encoding=self.data_instance.raw_encoding)
    good_paragraphs = []
    for para in paragraphs:
        if para['class'] == 'good':
            paragraph_text = para['text']
            # This assertion makes sure we catch str and unicode only.
            assert isinstance(paragraph_text, basestring)
            if isinstance(paragraph_text, unicode):
                good_paragraphs.append(paragraph_text.encode('utf8', 'ignore'))
            else:
                good_paragraphs.append(paragraph_text)
    return '\n\n'.join(good_paragraphs)
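On Python 3 with a current jusText release, the same extraction needs none of the str/unicode juggling: justext() returns paragraph objects rather than dicts. A rough equivalent, assuming the same self.data_instance accessors and that the installed jusText accepts the encoding keyword:

def extract(self):
    html = self.data_instance.get_raw_html()
    paragraphs = justext.justext(html, justext.get_stoplist('English'),
                                 encoding=self.data_instance.raw_encoding)
    # Paragraph objects expose .text and .is_boilerplate directly.
    return '\n\n'.join(p.text for p in paragraphs if not p.is_boilerplate)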
def get_content(self, html):
    # TODO: when this fails, refactor the other get_content into here.
    lang_mapping = {'nl': 'Dutch', 'en': 'English', 'com': 'English'}
    if self.detected_language not in lang_mapping:
        # Return an empty list so the return type matches the success path.
        return []
    lang = lang_mapping[self.detected_language]
    body_content = [x.text for x in justext.justext(html, justext.get_stoplist(lang))
                    if not x.is_boilerplate and not x.is_heading]
    return body_content
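The names passed to justext.get_stoplist() must match jusText's bundled stoplists exactly, or the call fails. A small sketch for validating a detected language code first; stoplist_for and LANG_MAPPING are hypothetical helpers, not part of the snippet above:

LANG_MAPPING = {'nl': 'Dutch', 'en': 'English', 'com': 'English'}

def stoplist_for(code):
    language = LANG_MAPPING.get(code)
    # get_stoplists() enumerates the stoplist names shipped with jusText.
    if language is None or language not in justext.get_stoplists():
        return None
    return justext.get_stoplist(language)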
import lxml.etree

def parse_html(page):
    """Clean HTML tags for web pages that aren't Gutenberg books."""
    try:
        parts = justext.justext(page, justext.get_stoplist('English'))
    except lxml.etree.ParserError:
        print('Page empty')
        return ''
    except UnicodeDecodeError:
        print("Can't decode as UTF-8")
        return ''
    paragraphs = []
    for part in parts:
        if not part.is_boilerplate:
            paragraphs.append(part.text)
    return '\n\n'.join(paragraphs)
def remove_boilerplate(html, language="English"):
    try:
        paragraphs = justext.justext(html, justext.get_stoplist(language))
    except Exception:
        return html  # TODO: alternative to justext
    # Underline headings and separate body paragraphs with blank lines.
    tag = lambda p: ("%s\n----\n" if p.is_heading else "%s\n\n") % p.text
    content = "".join([tag(p) for p in paragraphs if not p.is_boilerplate])
    return content
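To see the heading markup this produces, feed it a real page; jusText tends to flag very short synthetic documents as boilerplate, so a toy HTML string is a poor test. A sketch, assuming requests and a placeholder URL:

import requests

html = requests.get('https://example.com/some-article').text  # placeholder URL
print(remove_boilerplate(html))
# Headings come out underlined with '----'; body paragraphs are
# separated by blank lines.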
    An instance of Document. Note that the score is initialized to 0 and should be set later.
"""
trec_doc_lower = trec_doc.lower()
# TREC documents wrap the ID in <docno> tags; matching on the lowercased
# copy makes the lookup case-insensitive.
id = trec_doc[trec_doc_lower.find('<docno>') + len('<docno>'):trec_doc_lower.find('</docno>')].strip()
title = id  # for presentation reasons, the title of the document is set to its ID
if format == 'trectext':
    text = trec_doc[trec_doc_lower.find('<text>') + len('<text>'):trec_doc_lower.find('</text>')].strip()
def remove_bad_by_classifier(doc):
    # Legacy dict-based jusText API: each paragraph carries its source
    # 'xpath', so 'bad' nodes can be dropped from the lxml tree in place.
    ps = justext.justext(doc, justext.get_stoplist('English'))
    to_delete = []
    good = []
    for p in ps:
        if p['class'] == 'bad':
            for el in doc.xpath(p['xpath']):
                to_delete.append((el, p['xpath']))
        elif p['class'] == 'good':
            good.append(p['xpath'])
    # Delete in reverse document order, keeping any node whose xpath is a
    # prefix of a 'good' paragraph's xpath.
    for el, xp in reversed(to_delete):
        if el.getparent() is not None and not any(xp in g for g in good):
            el.drop_tree()
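A usage sketch for the pruning function above, assuming a jusText build that accepts an lxml tree and returns dict-style paragraphs with 'class' and 'xpath' keys, as the snippet expects; raw_html stands in for page source fetched elsewhere:

import lxml.html

tree = lxml.html.fromstring(raw_html)
remove_bad_by_classifier(tree)
cleaned = lxml.html.tostring(tree, encoding='unicode')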
def _find_heuristics(doc, paragraphs=None):
    # Collect src attributes of images that immediately precede 'good'
    # paragraphs; utils.precedings is a project-local helper.
    ps = paragraphs or justext.justext(doc, justext.get_stoplist('English'))
    prev = None
    images = []
    for p in ps:
        if p['class'] == 'good':
            xpath = p['xpath']
            e = doc.xpath(xpath)
            if not e:
                continue
            e = e[0]
            for prec in utils.precedings(e,
                    before=lambda x: prev is not None and prev is e):
                if prec.tag == 'img' and prec.attrib.get('src'):
                    images.append(prec.attrib['src'])
            prev = e
    return images
if processed and not reprocess:
    # logger.debug("Skipping %s", article['uuid'])
    return {'article': processed, 'from_store': True}
article['title'] = item.get('title', None)
href = item.get('link', None)
article['href'] = strip_google(href)
article['source'] = derive_source(article['href'])
article['collected'] = now_time()
article['published'] = item.get('published', None)
article['summary'] = item.get('summary', None)
page_content = get_page_content(article['href'])
if not page_content:
    logger.debug("No content found: %s", article['href'])
    return {'article': None, 'from_store': True}
# Tuned jusText thresholds: heading detection disabled, a lower length_high
# than the default, a higher allowed link density, and lower
# stopword-density cutoffs for classification.
paragraphs = justext.justext(page_content,
                             justext.get_stoplist("English"),
                             no_headings=True,
                             max_heading_distance=150,
                             length_high=140,
                             max_link_density=0.4,
                             stopwords_low=0.2,
                             stopwords_high=0.3)
text_content = []
for paragraph in paragraphs:
    if paragraph.is_boilerplate:
        continue
    text_content.append(paragraph.text)
text_content = '\n'.join(text_content)
tokens = get_tokens(text_content)
article['word_count'] = len(tokens)
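get_tokens comes from the surrounding module and is not shown. For the word count above, a minimal stand-in (hypothetical, not the project's implementation) could be:

import re

def get_tokens(text):
    # Hypothetical tokenizer: lowercase and split on word characters.
    return re.findall(r'\w+', text.lower())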