How to use the extruct.rdfa.RDFaExtractor function in extruct

To help you get started, we’ve selected a few extruct examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scrapinghub / extruct / tests / test_rdfa.py View on Github external
def test_w3c_rdf11primer(self):
    """RDF 1.1 Primer example 14: RDFa extraction must match the
    pre-expanded JSON-LD fixture."""
    for example_no in [14]:
        prefix = 'w3c.rdf11primer.example{:03d}'.format(example_no)
        html = get_testdata('w3crdfa', prefix + '.html')
        raw_expected = get_testdata('w3crdfa', prefix + '.expanded.json')
        wanted = json.loads(raw_expected.decode('UTF-8'))

        extractor = RDFaExtractor()
        extracted = extractor.extract(
            html, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(extracted, wanted)
github scrapinghub / extruct / tests / test_rdfa.py View on Github external
def test_expanded_opengraph_support(self):
    """Expanded OpenGraph vocabulary must round-trip through the
    RDFa extractor and equal the stored JSON fixture."""
    html = get_testdata('misc', 'expanded_OG_support_test.html')
    fixture = get_testdata('misc', 'expanded_OG_support_test.json')
    wanted = json.loads(fixture.decode('UTF-8'))

    extractor = RDFaExtractor()
    extracted = extractor.extract(
        html, base_url='http://www.example.com/index.html')

    self.assertJsonLDEqual(extracted, wanted)
github scrapinghub / extruct / tests / test_rdfa.py View on Github external
def test_wikipedia_xhtml_rdfa_no_prefix(self):
    """XHTML+RDFa document without prefix declarations: extraction
    must still match the expected JSON-LD fixture."""
    html = get_testdata('misc', 'Portfolio_Niels_Lubberman.html')
    fixture = get_testdata('misc', 'Portfolio_Niels_Lubberman.json')
    wanted = json.loads(fixture.decode('UTF-8'))

    extractor = RDFaExtractor()
    extracted = extractor.extract(
        html, base_url='http://nielslubberman.nl/drupal/')

    self.assertJsonLDEqual(extracted, wanted)
github scrapinghub / extruct / tests / test_rdfa.py View on Github external
def test_wikipedia_xhtml_rdfa(self):
    """Wikipedia XHTML+RDFa sample: extracted data must equal the
    pre-expanded JSON-LD fixture."""
    prefix = 'xhtml+rdfa'
    html = get_testdata('wikipedia', prefix + '.html')
    raw_expected = get_testdata('wikipedia', prefix + '.expanded.json')
    wanted = json.loads(raw_expected.decode('UTF-8'))

    extractor = RDFaExtractor()
    extracted = extractor.extract(
        html, base_url='http://www.example.com/index.html')

    self.assertJsonLDEqual(extracted, wanted)
github scrapinghub / extruct / tests / test_rdfa.py View on Github external
def test_w3c_rdfalite(self):
    """W3C RDFa Lite examples 3-5: each must extract to its
    pre-expanded JSON-LD fixture."""
    for example_no in [3, 4, 5]:
        prefix = 'w3c.rdfalite.example{:03d}'.format(example_no)
        html = get_testdata('w3crdfa', prefix + '.html')
        raw_expected = get_testdata('w3crdfa', prefix + '.expanded.json')
        wanted = json.loads(raw_expected.decode('UTF-8'))

        extractor = RDFaExtractor()
        extracted = extractor.extract(
            html, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(extracted, wanted)
github scrapinghub / extruct / tests / test_rdfa.py View on Github external
def test_w3c_rdfaprimer(self):
    """W3C RDFa Primer examples: each must extract to its pre-expanded
    JSON-LD fixture, both normally and when ``_fix_order`` is broken.

    The second extraction is a regression guard for issue 116: even if
    ``_fix_order`` raises internally, extraction must still succeed and
    produce the same output.
    """
    base_url = 'http://www.example.com/index.html'

    # Hoisted out of the loop: the original redefined this mock on
    # every iteration. It simulates a bug inside _fix_order.
    def mocked_fix_order(x, y, z):
        raise Exception()

    for i in [5, 6, 7, 8, 9, 10, 11, 15]:
        # NOTE: removed a stray debug print(fileprefix) left in the test.
        fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(i)
        body = get_testdata('w3crdfa', fileprefix + '.html')
        expected = json.loads(
                   get_testdata('w3crdfa', fileprefix + '.expanded.json'
                   ).decode('UTF-8'))

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url=base_url)
        self.assertJsonLDEqual(data, expected)

        # Issue 116 guard: a failing _fix_order must not change output.
        rdfae._fix_order = mocked_fix_order
        data = rdfae.extract(body, base_url=base_url)
        self.assertJsonLDEqual(data, expected)
github pmyteh / RISJbot / RISJbot / metadata.py View on Github external
def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
    """Extract structured metadata from *response* (a Scrapy-style
    response with ``.text`` and ``.url``).

    Each syntax is attempted only when its flag is set. The result
    attributes ``rdfadata``, ``mdedata`` and ``jldata`` now always
    exist and default to ``[]``.

    Fix: previously ``rdfadata`` and ``mdedata`` were left unset when
    extraction failed (or was not requested) — only ``jldata`` had a
    fallback — so later attribute access raised AttributeError.
    """
    self.response = response
    self.microdata = microdata
    self.jsonld = jsonld
    self.rdfa = rdfa

    # Safe defaults so callers can always iterate these attributes.
    self.rdfadata = []
    self.mdedata = []
    self.jldata = []

    if rdfa:
        try:
            self.rdfae = RDFaExtractor()
            self.rdfadata = self.rdfae.extract(self.response.text,
                                               url=self.response.url)
        except JSONDecodeError:
            # Best-effort: keep the empty default on a parse failure.
            pass
    if microdata:
        try:
            self.mde = MicrodataExtractor()
            self.mdedata = self.mde.extract(self.response.text)
        except JSONDecodeError:
            pass
    if jsonld:
        try:
            self.jlde = JsonLdExtractor()
            self.jldata = self.jlde.extract(self.response.text)
        except (JSONDecodeError, TypeError):
            self.jldata = []
github scrapinghub / extruct / extruct / __init__.py View on Github external
if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(('microdata', MicrodataExtractor().extract_items))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items))
    if 'opengraph' in syntaxes:
        processors.append(('opengraph', OpenGraphExtractor().extract_items))
    if 'microformat' in syntaxes:
        processors.append(('microformat', MicroformatExtractor().extract_items))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items))
    output = {}
    for label, extract in processors:
        try:
            output[label] = [obj for obj in extract(document=tree,
                                                    url=url,
                                                    html=htmlstring)]
        except Exception:
            if errors == 'log':
                logger.exception("Failed to extract {} from {}".format(label, url))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    
    if uniform:
        if 'microdata' in syntaxes:
github scrapinghub / extruct / extruct / _extruct.py View on Github external
))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph',
             OpenGraphExtractor().extract_items,
             tree
             ))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat',
             MicroformatExtractor().extract_items,
             htmlstring
             ))
    if 'rdfa' in syntaxes:
        processors.append(
            ('rdfa', RDFaExtractor().extract_items,
             tree,
             ))

    output = {}
    for syntax, extract, document in processors:
        try:
            output[syntax] = list(extract(document, base_url=base_url))
        except Exception as e:
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'
                                 .format(syntax, e)
                                 )
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise