How to use the extruct.jsonld.JsonLdExtractor class in extruct

To help you get started, we’ve selected a few extruct examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scrapinghub / extruct / extruct / __init__.py View on Github external
/* All other the properties in keys here */
                 }
       schema_context: schema's context for current page"""
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(('microdata', MicrodataExtractor().extract_items))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items))
    if 'opengraph' in syntaxes:
        processors.append(('opengraph', OpenGraphExtractor().extract_items))
    if 'microformat' in syntaxes:
        processors.append(('microformat', MicroformatExtractor().extract_items))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items))
    output = {}
    for label, extract in processors:
        try:
            output[label] = [obj for obj in extract(document=tree,
                                                    url=url,
                                                    html=htmlstring)]
        except Exception:
            if errors == 'log':
                logger.exception("Failed to extract {} from {}".format(label, url))
            if errors == 'ignore':
github scrapinghub / extruct / tests / test_jsonld.py View on Github external
def _check_jsonld(self, body, expected):
        """Assert that a fresh JsonLdExtractor extracts ``expected`` from ``body``.

        A new extractor is created on every call, ``body`` is passed to its
        ``extract`` method, and the result is compared against ``expected``
        with ``self.assertEqual``.
        """
        extracted = JsonLdExtractor().extract(body)
        self.assertEqual(extracted, expected)
github pmyteh / RISJbot / RISJbot / metadata.py View on Github external
if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))
github scrapinghub / extruct / extruct / _extruct.py View on Github external
'Failed to parse html, raises {}'.format(e))
            return {}
        if errors == 'strict':
            raise

    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree
             ))
    if 'json-ld' in syntaxes:
        processors.append(
            ('json-ld',
             JsonLdExtractor().extract_items,
             tree,
             ))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph',
             OpenGraphExtractor().extract_items,
             tree
             ))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat',
             MicroformatExtractor().extract_items,
             htmlstring
             ))
    if 'rdfa' in syntaxes:
        processors.append(