How to use gne - 10 common examples

To help you get started, we’ve selected a few gne examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github kingname / GeneralNewsExtractor / gne / extractor / TitleExtractor.py View on Github external
def extract_by_title(self, element):
        """Return the leading segment of the page's ``<title>`` text.

        The first text node matched by ``//title/text()`` is split with
        ``TITLE_SPLIT_CHAR_PATTERN`` and the piece in front of the first
        separator is returned.

        :param element: object exposing ``xpath()`` for the parsed page
        :return: first segment of the title, or '' when no title text is found
        """
        candidates = element.xpath('//title/text()')
        if candidates:
            segments = re.split(TITLE_SPLIT_CHAR_PATTERN, candidates[0])
            return segments[0] if segments else ''
        return ''
github kingname / GeneralNewsExtractor / gne / extractor / TitleExtractor.py View on Github external
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
        """Extract the news title, trying each strategy in priority order.

        Order of attempts: the XPath (argument, else ``config['title']['xpath']``),
        then h-tag/title matching, then the ``<title>`` text, then h-tags alone.
        The first truthy result wins; later strategies are not evaluated.

        :param element: parsed page to extract from
        :param title_xpath: optional XPath overriding the configured one
        :return: the chosen title with surrounding whitespace stripped
        """
        if not title_xpath:
            title_xpath = config.get('title', {}).get('xpath')

        title = self.extract_by_xpath(element, title_xpath)
        if not title:
            title = self.extract_by_htag_and_title(element)
        if not title:
            title = self.extract_by_title(element)
        if not title:
            title = self.extract_by_htag(element)
        return title.strip()
github kingname / GeneralNewsExtractor / gne / extractor / AuthorExtractor.py View on Github external
def extractor(self, element: HtmlElement, author_xpath=''):
        """Extract the author name from the page.

        When an XPath is available (argument, else ``config['author']['xpath']``)
        its results are concatenated and returned as-is. Otherwise every
        pattern in ``self.author_pattern`` is searched against the text under
        ``element`` and group 1 of the first match is returned.

        :param element: parsed page to extract from
        :param author_xpath: optional XPath overriding the configured one
        :return: the author string, or '' when no pattern matches
        """
        xpath_rule = author_xpath or config.get('author', {}).get('xpath')
        if xpath_rule:
            return ''.join(element.xpath(xpath_rule))

        page_text = ''.join(element.xpath('.//text()'))
        # Lazy: stop searching as soon as one pattern matches.
        attempts = (re.search(pattern, page_text) for pattern in self.author_pattern)
        first_hit = next((found for found in attempts if found), None)
        return first_hit.group(1) if first_hit else ''
github kingname / GeneralNewsExtractor / gne / extractor / TimeExtractor.py View on Github external
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
        """Extract the publish time, trying each source in priority order.

        :param element: parsed page to extract from
        :param publish_time_xpath: optional XPath overriding
            ``config['publish_time']['xpath']``
        :return: the first truthy result, else whatever the last strategy returned
        """
        if not publish_time_xpath:
            publish_time_xpath = config.get('publish_time', {}).get('xpath')

        # The user-specified XPath has the highest priority.
        found = self.extract_from_user_xpath(publish_time_xpath, element)
        if not found:
            # Second priority: extract from the META data.
            found = self.extract_from_meta(element)
        if not found:
            # Worst case: extract from the body text.
            found = self.extract_from_text(element)
        return found
github kingname / GeneralNewsExtractor / gne / __init__.py View on Github external
title = TitleExtractor().extract(element, title_xpath=title_xpath)
        publish_time = TimeExtractor().extractor(element, publish_time_xpath=publish_time_xpath)
        author = AuthorExtractor().extractor(element, author_xpath=author_xpath)
        element = pre_parse(element)
        remove_noise_node(element, noise_node_list)
        content = ContentExtractor().extract(element,
                                             host=host,
                                             with_body_html=with_body_html,
                                             body_xpath=body_xpath)
        result = {'title': title,
                  'author': author,
                  'publish_time': publish_time,
                  'content': content[0][1]['text'],
                  'images': content[0][1]['images']
                  }
        if with_body_html or config.get('with_body_html', False):
            result['body_html'] = content[0][1]['body_html']
        return result
github kingname / GeneralNewsExtractor / gne / extractor / TitleExtractor.py View on Github external
GNE 成为全球最好的新闻提取模块-今日头条
        新华网:GNE 成为全球最好的新闻提取模块

        同时,新闻的某个 <h> 标签中也会包含这个新闻标题。

        因此,通过 h 标签与 title 的文字双向匹配,找到最适合作为新闻标题的字符串。
        但是,需要考虑到 title 与 h 标签中的文字可能均含有特殊符号,因此,不能直接通过
        判断 h 标签中的文字是否在 title 中来判断,这里需要中最长公共子串。
        :param element:
        :return:
        """
        h_tag_texts_list = element.xpath('(//h1//text() | //h2//text() | //h3//text() | //h4//text() | //h5//text())')
        title_text = ''.join(element.xpath('//title/text()'))
        news_title = ''
        for h_tag_text in h_tag_texts_list:
            lcs = get_longest_common_sub_string(title_text, h_tag_text)
            if len(lcs) > len(news_title):
                news_title = lcs
        return news_title
github kingname / GeneralNewsExtractor / example.py View on Github external
import json
import glob
from gne import GeneralNewsExtractor


if __name__ == '__main__':
    # Run the extractor over every HTML file found recursively under tests/
    # and print each result as indented JSON between two banner lines.
    for sample_path in glob.glob('tests/**/*.html', recursive=True):
        with open(sample_path, encoding='utf-8') as handle:
            page_source = handle.read()

        # XPaths handed to the extractor as noise_node_list; built fresh per page.
        noise_xpaths = [
            '//div[@class="comment-list"]',
            '//*[@style="display:none"]',
            '//div[@class="statement"]',
        ]
        news = GeneralNewsExtractor().extract(
            page_source,
            host='https://www.xxx.com',
            noise_node_list=noise_xpaths,
        )

        print(f'>>>>>>>>>>>>>{sample_path}>>>>>>>>>>>>>')
        # ensure_ascii=False keeps non-ASCII text readable in the dump.
        print(json.dumps(news, indent=2, ensure_ascii=False))
        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
github kingname / GeneralNewsExtractor / gne / extractor / TimeExtractor.py View on Github external
def __init__(self):
        """Store the ``DATETIME_PATTERN`` constant on the instance as ``time_pattern``."""
        self.time_pattern = DATETIME_PATTERN
github kingname / GeneralNewsExtractor / gne / extractor / TimeExtractor.py View on Github external
def extract_from_meta(self, element: HtmlElement) -> str:
        """
        Look for the publish time in the page's META data.

        Well-structured news sites put the publish time in META tags, so this
        source should be checked first. Each XPath in ``PUBLISH_TIME_META`` is
        tried in order and the first non-empty result is joined into a string.

        :param element: DOM tree built from the page source
        :return: the joined match, or '' when no XPath yields anything
        """
        # Lazy: later XPaths are not evaluated once one produces a result.
        results = (element.xpath(query) for query in PUBLISH_TIME_META)
        first_hit = next((hit for hit in results if hit), None)
        if first_hit is None:
            return ''
        return ''.join(first_hit)