How to use the extruct.w3cmicrodata.MicrodataExtractor class in extruct

To help you get started, we’ve selected a few extruct examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scrapinghub / extruct / extruct / __init__.py View on Github external
{'@context': 'http://example.com', 
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       schema_context: schema's context for current page"""
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(('microdata', MicrodataExtractor().extract_items))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items))
    if 'opengraph' in syntaxes:
        processors.append(('opengraph', OpenGraphExtractor().extract_items))
    if 'microformat' in syntaxes:
        processors.append(('microformat', MicroformatExtractor().extract_items))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items))
    output = {}
    for label, extract in processors:
        try:
            output[label] = [obj for obj in extract(document=tree,
                                                    url=url,
                                                    html=htmlstring)]
        except Exception:
            if errors == 'log':
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_join_none(self):
    # Run a no-argument MicrodataExtractor over the 'product-ref.html'
    # fixture and compare the result with the decoded 'product-ref.json'.
    html = get_testdata('schema.org', 'product-ref.html')
    raw_expected = get_testdata('schema.org', 'product-ref.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    got = MicrodataExtractor().extract(html)
    self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_w3c_5_2(self):
    # Extract 'microdata.5.2.html' with nested=False and strict=True, then
    # compare against the decoded 'microdata.5.2.flat.json' fixture.
    html = get_testdata('w3c', 'microdata.5.2.html')
    raw_expected = get_testdata('w3c', 'microdata.5.2.flat.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    extractor = MicrodataExtractor(nested=False, strict=True)
    got = extractor.extract(html)
    self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_w3c_5_3(self):
    # Extract 'microdata.5.3.html' with strict=True and compare against
    # the decoded 'microdata.5.3.json' fixture.
    html = get_testdata('w3c', 'microdata.5.3.html')
    raw_expected = get_testdata('w3c', 'microdata.5.3.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    extractor = MicrodataExtractor(strict=True)
    got = extractor.extract(html)
    self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_w3c_textContent_values(self):
    # Extract 'microdata.4.2.strings.html' with strict=True and compare
    # against the decoded 'microdata.4.2.strings.json' fixture.
    html = get_testdata('w3c', 'microdata.4.2.strings.html')
    raw_expected = get_testdata('w3c', 'microdata.4.2.strings.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    extractor = MicrodataExtractor(strict=True)
    got = extractor.extract(html)
    self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_schemaorg_MusicRecording(self):
    # For each numbered MusicRecording fixture (currently only 001),
    # extract the HTML with a no-argument MicrodataExtractor and compare
    # the result with the matching decoded JSON fixture.
    for index in (1,):
        html_name = 'MusicRecording.{:03d}.html'.format(index)
        html = get_testdata('schema.org', html_name)
        json_name = 'MusicRecording.{:03d}.json'.format(index)
        raw_expected = get_testdata('schema.org', json_name)
        want = json.loads(raw_expected.decode('UTF-8'))

        got = MicrodataExtractor().extract(html)
        self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_w3c_object_element(self):
    # Extract 'microdata.object.html' with strict=True, passing a URL as
    # the second positional argument of extract(), and compare against
    # the decoded 'microdata.object.json' fixture.
    html = get_testdata('w3c', 'microdata.object.html')
    raw_expected = get_testdata('w3c', 'microdata.object.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    extractor = MicrodataExtractor(strict=True)
    got = extractor.extract(html, 'http://www.example.com/microdata/test')
    self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_w3c_7_1(self):
    # Extract 'microdata.7.1.html' with nested=False and strict=True,
    # passing a URL as the second positional argument of extract(), and
    # compare against the decoded 'microdata.7.1.flat.json' fixture.
    html = get_testdata('w3c', 'microdata.7.1.html')
    raw_expected = get_testdata('w3c', 'microdata.7.1.flat.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    extractor = MicrodataExtractor(nested=False, strict=True)
    got = extractor.extract(html, 'http://blog.example.com/progress-report')
    self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_w3c_data_element(self):
    # Extract 'microdata.4.2.data.html' with strict=True and compare
    # against the decoded 'microdata.4.2.data.json' fixture.
    html = get_testdata('w3c', 'microdata.4.2.data.html')
    raw_expected = get_testdata('w3c', 'microdata.4.2.data.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    extractor = MicrodataExtractor(strict=True)
    got = extractor.extract(html)
    self.assertEqual(got, want)
github scrapinghub / extruct / tests / test_microdata.py View on Github external
def test_join_custom_url(self):
    # Extract 'product.html' with a no-argument MicrodataExtractor while
    # supplying base_url='http://some-example.com', and compare against
    # the decoded 'product_custom_url.json' fixture.
    html = get_testdata('schema.org', 'product.html')
    raw_expected = get_testdata('schema.org', 'product_custom_url.json')
    want = json.loads(raw_expected.decode('UTF-8'))

    got = MicrodataExtractor().extract(html, base_url='http://some-example.com')
    self.assertEqual(got, want)