How to use the gne.utils.config function in gne

To help you get started, we’ve selected a few gne examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github kingname / GeneralNewsExtractor / gne / extractor / TitleExtractor.py View on Github external
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
        """Return the page title, trying several strategies in priority order.

        When ``title_xpath`` is empty, the ``xpath`` entry under the ``title``
        section of ``config`` is used instead (it may be absent, in which case
        ``None`` is handed to the XPath strategy).

        Strategies are attempted one at a time and the first truthy result
        wins: explicit XPath, then h-tag combined with ``<title>``, then
        ``<title>`` alone, then h-tag alone. The winning value is returned
        with surrounding whitespace stripped.
        """
        if not title_xpath:
            title_xpath = config.get('title', {}).get('xpath')

        # Each fallback only runs when every earlier strategy came back empty.
        found = self.extract_by_xpath(element, title_xpath)
        if not found:
            found = self.extract_by_htag_and_title(element)
        if not found:
            found = self.extract_by_title(element)
        if not found:
            found = self.extract_by_htag(element)
        return found.strip()
github kingname / GeneralNewsExtractor / gne / extractor / AuthorExtractor.py View on Github external
def extractor(self, element: HtmlElement, author_xpath=''):
        """Return the author found in ``element``, or ``''`` when none is found.

        An XPath (the ``author_xpath`` argument, or failing that the ``xpath``
        entry under the ``author`` section of ``config``) takes precedence:
        its results are concatenated and returned as-is. Without an XPath,
        all text under ``element`` is joined and searched with each pattern
        in ``self.author_pattern`` in order; group 1 of the first match is
        returned.
        """
        xpath = author_xpath or config.get('author', {}).get('xpath')
        if xpath:
            return ''.join(element.xpath(xpath))

        page_text = ''.join(element.xpath('.//text()'))
        # Lazy generator: patterns after the first successful one are never run.
        attempts = (re.search(pattern, page_text) for pattern in self.author_pattern)
        first_hit = next((hit for hit in attempts if hit), None)
        return first_hit.group(1) if first_hit else ''
github kingname / GeneralNewsExtractor / gne / extractor / TimeExtractor.py View on Github external
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
        """Return the publish time, trying sources in descending priority.

        When ``publish_time_xpath`` is empty, the ``xpath`` entry under the
        ``publish_time`` section of ``config`` is used instead. The first
        source that yields a truthy value wins; if none does, whatever the
        text-based extraction returns is passed through unchanged.
        """
        xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')

        # First priority: the XPath specified by the user.
        from_xpath = self.extract_from_user_xpath(xpath, element)
        if from_xpath:
            return from_xpath

        # Second priority: extract from the meta information.
        from_meta = self.extract_from_meta(element)
        if from_meta:
            return from_meta

        # Worst case: extract from the body text.
        return self.extract_from_text(element)
github kingname / GeneralNewsExtractor / gne / __init__.py View on Github external
title = TitleExtractor().extract(element, title_xpath=title_xpath)
        publish_time = TimeExtractor().extractor(element, publish_time_xpath=publish_time_xpath)
        author = AuthorExtractor().extractor(element, author_xpath=author_xpath)
        element = pre_parse(element)
        remove_noise_node(element, noise_node_list)
        content = ContentExtractor().extract(element,
                                             host=host,
                                             with_body_html=with_body_html,
                                             body_xpath=body_xpath)
        result = {'title': title,
                  'author': author,
                  'publish_time': publish_time,
                  'content': content[0][1]['text'],
                  'images': content[0][1]['images']
                  }
        if with_body_html or config.get('with_body_html', False):
            result['body_html'] = content[0][1]['body_html']
        return result