How to use goose3 - 10 common examples

To help you get started, we've selected a few goose3 examples based on popular ways it is used in public projects.
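
All of the snippets below build on the same core extraction flow; here is a minimal sketch of it (the URL is a placeholder):

from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/some-article')  # placeholder URL
print(article.title)
print(article.cleaned_text[:200])
g.close()  # release the fetcher's network resources when done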


github jroakes/tech-seo-crawler - lib/crawler.py
def crawl_url(url):
    g = Goose({'browser_user_agent': cfg.browser_user_agent, 'parser_class': 'soup'})
    r = g.fetcher.fetch_obj(url)
    html = r.content.decode('utf-8').strip()
    # Make src urls absolute
    html = abs_src(html, r.url)
    page = g.extract(raw_html=html)
    infos = page.infos

    infos['status']          = r.status_code
    infos['headers']         = r.headers
    infos['link_hash']       = page.link_hash
    infos['final_url']       = r.url
    infos['domain']          = get_hostname(r.url)
    infos['links']           = LinksExtractor(g.config, page).extract(url)
    infos['meta']['robots']  = RobotsExtractor(g.config, page).extract()
    infos['content']         = ' '.join(page.cleaned_text.split())
    return infos
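
A hedged usage sketch for crawl_url (cfg, abs_src, get_hostname, and the two extractor classes come from the project's own modules; the URL is a placeholder):

infos = crawl_url('https://example.com/some-page')
print(infos['status'], infos['final_url'])
print(len(infos['links']), 'links; robots:', infos['meta']['robots'])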

github jroakes/tech-seo-crawler - lib/crawler.py
class LinksExtractor(BaseExtractor):

    def extract(self, url):
        links = []
        items = self.parser.getElementsByTag(self.article._raw_doc, 'a')

        for i in items:
            href = get_canonical_url(self.parser.getAttribute(i, 'href'), url)
            attr = {'href': href, 'text': self.parser.getText(i) or '', 'rel': self.parser.getAttribute(i, 'rel') or ''}
            # Keep only anchors that resolve to a canonical href
            if href:
                links.append(attr)
        return links


class RobotsExtractor(BaseExtractor):

    def extract(self):
        robots = []
        kwargs = {'tag': 'meta', 'attr': 'name', 'value': 'robots'}
        items = self.parser.getElementsByTag(self.article._raw_doc, **kwargs)
        for i in items:
            attr = self.parser.getAttribute(i, 'content')
            if attr:
                # Split comma-separated directives, e.g. 'noindex, nofollow'
                robots.extend(a.strip().lower() for a in attr.split(','))
        return robots
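
Both classes follow goose3's BaseExtractor convention of receiving the configuration and the extracted article, which is how crawl_url above drives them; a minimal sketch, assuming a placeholder URL:

from goose3 import Goose

g = Goose({'parser_class': 'soup'})
page = g.extract(url='https://example.com/')  # placeholder URL
links = LinksExtractor(g.config, page).extract('https://example.com/')
robots = RobotsExtractor(g.config, page).extract()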



github team-anything/Briefly - App/subscribe.py
def summary(url):
    g = Goose()
    article = g.extract(url)
    title = article.title
    publish_date = "None"
    headlines = []
    if title is None:
        title = url
    try:
        image = article.top_image.src
    except Exception:
        if len(article.images) > 0:
            image = article.images[0]
        else:
            image = "http://www.sahalnews.com/wp-content/uploads/2014/12/news-update-.jpg"
    for bullets in summarize(url, title, article.cleaned_text, n_bullets):
        headlines.append(bullets)
    if len(headlines) == 0:

github hoxis/to_voice - page2voice.py
from goose3 import Goose
from goose3.text import StopWordsChinese

def get_text(url):
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text
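
goose3 ships stopword tokenizers for other languages that need special word segmentation as well; besides the StopWordsChinese class used above, goose3.text also provides StopWordsArabic and StopWordsKorean. A minimal sketch:

from goose3 import Goose
from goose3.text import StopWordsArabic, StopWordsKorean

g_arabic = Goose({'stopwords_class': StopWordsArabic})
g_korean = Goose({'stopwords_class': StopWordsKorean})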

github LuChang-CS/news-crawler - article/bbc_article.py
def _extract_content(self, html):
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text

github verifiqueme/core - jano/controllers/ArticleExtractor.py
            else:
                raise TextUnavailable("No text is available from NewsPlease for analysis. Try Goose3.")
            # Set the date
            if artigo.date_publish is not None:
                data = str(artigo.date_publish)
            elif artigo.date_modify is not None and artigo.date_modify != "None":
                data = str(artigo.date_modify)
            else:
                data = str(artigo.date_download)

            objeto = ArticleObject(fixcharset(artigo.title), url, None, data, artigo.authors,
                                   artigo.source_domain, text)
            return objeto
        except Exception:
            from goose3 import Goose
            g = Goose(
                {'strict': False, 'use_meta_language': True,
                 'target_language': Config().values()['language'].replace("-", "_"),
                 'parser_class': 'lxml', 'enable_image_fetching': False, 'http_timeout': 1})
            artigo = g.extract(url=url)
            if artigo.cleaned_text:
                text = fixcharset(artigo.cleaned_text)
            elif artigo.meta_description:
                text = fixcharset(artigo.meta_description)
            else:
                raise TextUnavailable("There is not enough text for analysis.")

            objeto = ArticleObject(fixcharset(artigo.title), url, None,
                                   artigo.publish_date, artigo.authors, artigo.domain, text)
            return objeto
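
Note the fallback configuration above: with use_meta_language set to True, goose3 prefers the language declared by the page itself and falls back to target_language only when the page does not declare one, and strict set to False asks goose3 to tolerate extraction errors rather than raise them.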

github LuChang-CS/news-crawler - article/nytimes_article.py
def _extract_content(self, html):
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text

github LuChang-CS/news-crawler - article/reuters_article.py
def _extract_content(self, html):
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text
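
The three news-crawler snippets above are identical; a hedged refactor sketch that hoists the pattern into a shared base class (ArticleBase is hypothetical) and reuses one Goose instance instead of constructing one per call:

from goose3 import Goose

class ArticleBase:
    # Hypothetical shared base for the BBC, NYTimes, and Reuters article classes
    _goose = Goose({'enable_image_fetching': False})

    def _extract_content(self, html):
        return self._goose.extract(raw_html=html).cleaned_text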

github fanmatics/metadoc - metadoc/extract/extractor.py
    self.keywords = []
    self.names = []
    self.fulltext = None
    self.language = None
    self.description = None
    self.canonical_url = None
    self.image = None
    self.published_date = None
    self.modified_date = None
    self.scraped_date = None
    self.contenthash = None
    self.reading_time = None

    config = Configuration()
    config.enable_image_fetching = False
    self.goose = Goose(config=config)

    self.tree = None
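
goose3 accepts either a Configuration instance, as here, or a plain dict of the same settings, as in the news-crawler snippets above; a minimal equivalence sketch:

from goose3 import Goose
from goose3.configuration import Configuration

config = Configuration()
config.enable_image_fetching = False
g1 = Goose(config=config)                      # object style, as in this snippet

g2 = Goose({'enable_image_fetching': False})   # dict style, equivalent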

github jroakes/tech-seo-crawler - lib/crawler.py
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


from goose3 import Goose
from goose3.extractors import BaseExtractor
from lib.utils import *

import config as cfg




goose3

HTML Content / Article Extractor, web scraping for Python3

License: Apache-2.0
Latest version published 10 months ago
Package Health Score: 71 / 100