Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def crawl_url(url):
    """Fetch *url* and return a dict of extracted page information.

    Uses goose3 for content extraction, then augments the result with
    HTTP metadata, outgoing links and robots meta directives.
    """
    g = Goose({'browser_user_agent': cfg.browser_user_agent, 'parser_class': 'soup'})
    r = g.fetcher.fetch_obj(url)
    # assumes the response body is UTF-8 encoded -- TODO confirm
    html = r.content.decode('utf-8').strip()
    # Make src urls absolute so extracted references resolve correctly
    html = abs_src(html, r.url)
    page = g.extract(raw_html=html)
    infos = page.infos
    infos['status'] = r.status_code
    infos['headers'] = r.headers
    infos['link_hash'] = page.link_hash
    # r.url reflects the URL after any redirects; the original also assigned
    # page.final_url first, but that value was immediately overwritten.
    infos['final_url'] = r.url
    infos['domain'] = get_hostname(r.url)
    infos['links'] = LinksExtractor(g.config, page).extract(url)
    infos['meta']['robots'] = RobotsExtractor(g.config, page).extract()
    # Collapse whitespace runs in the extracted text to single spaces
    infos['content'] = ' '.join(page.cleaned_text.split())
    # BUG FIX: the original built `infos` but never returned it
    return infos
class LinksExtractor(BaseExtractor):
    """Extracts every anchor (<a>) element from the raw document.

    ``extract`` returns a list of dicts with keys ``href`` (canonicalised
    against the page URL), ``text`` and ``rel``.
    """

    def extract(self, url):
        links = []
        items = self.parser.getElementsByTag(self.article._raw_doc, 'a')
        for i in items:
            href = get_canonical_url(self.parser.getAttribute(i, 'href'), url)
            attr = {'href': href,
                    'text': self.parser.getText(i) or '',
                    'rel': self.parser.getAttribute(i, 'rel') or ''}
            # BUG FIX: the original checked `if attr:`, which is always true
            # because attr is a freshly built non-empty dict. Guard on the
            # href instead so anchors without a usable target are skipped.
            if href:
                links.append(attr)
        return links
class RobotsExtractor(BaseExtractor):
    """Collects the directives declared in <meta name="robots"> tags."""

    def extract(self):
        directives = []
        tags = self.parser.getElementsByTag(
            self.article._raw_doc, tag='meta', attr='name', value='robots')
        for tag in tags:
            content = self.parser.getAttribute(tag, 'content')
            if content and len(content):
                # Directives are comma-separated; normalise to lowercase
                directives.extend(part.strip().lower() for part in content.split(','))
        return directives
def crawl_url(url):
    # NOTE(review): truncated duplicate of crawl_url -- only the Goose
    # construction is visible here; the rest of the body is missing from
    # this chunk.
    g = Goose({'browser_user_agent': cfg.browser_user_agent, 'parser_class':'soup'})
def summary(url):
    # Build a bullet-point summary for the article at *url* using goose3.
    g = Goose()
    article = g.extract(url)
    title = article.title
    publish_date = "None"
    headlines = []
    # Fall back to the URL itself when no title could be extracted
    if title == None :
        title = url
    try:
        image = article.top_image.src
    except Exception:
        # No top image; fall back to the first extracted image, then to a
        # hard-coded placeholder image.
        if len(article.images)>0:
            image = article.images[0]
        else:
            image = "http://www.sahalnews.com/wp-content/uploads/2014/12/news-update-.jpg"
    # `summarize` and `n_bullets` are defined elsewhere in the project
    for bullets in summarize(url,title,article.cleaned_text,n_bullets):
        headlines.append(bullets)
    # NOTE(review): this `if` has no body in the visible source -- the
    # remainder of the function appears truncated at this point.
    if len(headlines)==0:
def get_text(url):
    """Return the cleaned article text for *url*, using Chinese stopwords."""
    extractor = Goose({'stopwords_class': StopWordsChinese})
    extracted = extractor.extract(url=url)
    return extracted.cleaned_text
def _extract_content(self, html):
    """Extract and return the main readable text from raw *html*."""
    goose = Goose({'enable_image_fetching': False})
    extracted = goose.extract(raw_html=html)
    return extracted.cleaned_text
# NOTE(review): fragment of a larger function -- the enclosing def/try/if
# are not visible in this chunk.
else:
    raise TextUnavailable("Não existem textos disponíveis no NewsPlease para análise. Tente com Goose3")
# Determine the publication date (first available of publish/modify/download)
if artigo.date_publish is not None:
    data = str(artigo.date_publish)
# NOTE(review): `is not "None"` is an identity comparison against a string
# literal and is unreliable -- this should be `!= "None"`.
elif artigo.date_modify is not None and artigo.date_modify is not "None":
    data = str(artigo.date_modify)
else:
    data = str(artigo.date_download)
objeto = ArticleObject(fixcharset(artigo.title), url, None, data, artigo.authors,
artigo.source_domain, text)
return objeto
# Fallback path: NewsPlease failed, retry extraction with goose3 directly
except Exception:
    from goose3 import Goose
    g = Goose(
        {'strict': False, 'use_meta_language': True,
         'target_language': Config().values()['language'].replace("-", "_"),
         'parser_class': 'lxml', 'enable_image_fetching': False, 'http_timeout': 1})
    artigo = g.extract(url=url)
    # Prefer the cleaned body text; fall back to the meta description
    if artigo.cleaned_text:
        text = fixcharset(artigo.cleaned_text)
    elif artigo.meta_description:
        text = fixcharset(artigo.meta_description)
    else:
        raise TextUnavailable("Não existem textos suficientes para análise.")
    objeto = ArticleObject(fixcharset(artigo.title), url, None,
    artigo.publish_date, artigo.authors, artigo.domain, text)
    return objeto
def _extract_content(self, html):
    """Run goose3 over raw *html* and return the cleaned article text."""
    parser = Goose({'enable_image_fetching': False})
    return parser.extract(raw_html=html).cleaned_text
def _extract_content(self, html):
    """Pull the main readable text out of *html* via goose3."""
    document = Goose({'enable_image_fetching': False}).extract(raw_html=html)
    return document.cleaned_text
# NOTE(review): body of a constructor -- the enclosing `def __init__` is
# not visible in this chunk.
self.keywords = []        # extracted keywords
self.names = []           # extracted names
self.fulltext = None      # full article text, filled in after scraping
self.language = None
self.description = None
self.canonical_url = None
self.image = None
self.published_date = None
self.modified_date = None
self.scraped_date = None
self.contenthash = None   # presumably a hash of the content for change detection -- verify against caller
self.reading_time = None
# Configure goose3 for text-only extraction (no image fetching)
config = Configuration()
config.enable_image_fetching = False
self.goose = Goose(config=config)
self.tree = None          # parsed document tree, set later
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from goose3 import Goose
from goose3.extractors import BaseExtractor
from lib.utils import *
import config as cfg
class LinksExtractor(BaseExtractor):
    """Extracts every anchor (<a>) element from the raw document.

    ``extract`` returns a list of dicts with keys ``href`` (canonicalised
    against the page URL), ``text`` and ``rel``.
    """

    def extract(self, url):
        links = []
        items = self.parser.getElementsByTag(self.article._raw_doc, 'a')
        for i in items:
            href = get_canonical_url(self.parser.getAttribute(i, 'href'), url)
            attr = {'href': href,
                    'text': self.parser.getText(i) or '',
                    'rel': self.parser.getAttribute(i, 'rel') or ''}
            # BUG FIX: the original checked `if attr:`, which is always true
            # because attr is a freshly built non-empty dict. Guard on the
            # href instead so anchors without a usable target are skipped.
            if href:
                links.append(attr)
        return links
class RobotsExtractor(BaseExtractor):
def extract(self):