How to use the extruct.extract function in extruct

To help you get started, we’ve selected a few extruct.extract examples based on popular ways it is used in public projects.

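Before working through the project snippets below, here is a minimal sketch of the basic call. The URL is a placeholder, and it assumes requests, extruct and w3lib are installed:

import requests
import extruct
from w3lib.html import get_base_url

url = 'https://example.com/'  # placeholder URL
r = requests.get(url, timeout=30)

# get_base_url resolves the page's base URL (honouring any <base> tag),
# which extruct uses to absolutize relative references in the metadata.
base_url = get_base_url(r.text, r.url)

# The result is a dict keyed by syntax name: 'json-ld', 'microdata',
# 'opengraph', 'microformat', 'rdfa', ...
data = extruct.extract(r.text, base_url=base_url)
print(list(data))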

github scrapinghub / extruct / tests / test_extruct_uniform.py
def test_microformat(self):
    body = get_testdata('misc', 'microformat_test.html')
    expected = json.loads(get_testdata('misc', 'microformat_flat_test.json').decode('UTF-8'))
    data = extruct.extract(body, uniform=True)
    self.assertEqual(jsonize_dict(data['microformat']), expected)
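
The uniform=True flag seen above reshapes each syntax's native output into schema.org-style dictionaries so downstream code can treat them interchangeably. A minimal sketch, assuming a small hand-written microformats2 fragment:

import extruct

html = '<article class="h-entry"><h1 class="p-name">Hello</h1></article>'

# Without uniform=True, the microformat extractor returns mf2py's native
# {'type': [...], 'properties': {...}} items; with it, extruct reshapes
# them into flat schema.org-style dicts.
data = extruct.extract(html, syntaxes=['microformat'], uniform=True)
print(data['microformat'])
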
github scrapinghub / extruct / tests / test_uniform.py
"aggregateRating": {
                          "@type": "AggregateRating",
                          "ratingValue": "4.4",
                          "reviewCount": "89"},
                      "offers": {
                          "@type": "Offer",
                          "priceCurrency": "USD",
                          "price": "119.99",
                          "priceValidUntil": "2020-11-05",
                          "seller": {"@type": "Organization",
                                     "name": "Executive Objects"},
                          "itemCondition": "http://schema.org/UsedCondition",
                          "availability": "http://schema.org/InStock"
                          }}]
        body = get_testdata('misc', 'product_microdata.html')
        data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
        self.assertEqual(data['microdata'], expected)
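
The syntaxes argument used above restricts which extractors run, and the returned dict then contains only the requested keys. A sketch with made-up inline microdata:

import extruct

html = '''<div itemscope itemtype="http://schema.org/Product">
  <span itemprop="name">ACME Widget</span>
</div>'''

# Only the requested extractor runs, so the result has a single key.
data = extruct.extract(html, syntaxes=['microdata'])
print(data['microdata'])
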
github scrapinghub / extruct / tests / test_extruct.py
def test_extra_kwargs(self):
    body, expected = self._microdata_custom_url('product_custom_url.json')
    with self.assertRaises(TypeError):
        extruct.extract(body, foo='bar')
github scrapinghub / extruct / tests / test_extruct.py
def test_errors(self):
    body = ''

    # errors='strict' (the default): raise exceptions
    with pytest.raises(Exception):
        data = extruct.extract(body)

    # errors='ignore': suppress exceptions
    expected = {}
    data = extruct.extract(body, errors='ignore')
    assert data == expected

    # errors='log': log exceptions instead of raising
    data = extruct.extract(body, errors='log')
    assert data == expected
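
To summarize the three modes: errors='strict' (the default) re-raises extractor failures, errors='log' records them and continues, and errors='ignore' continues silently. A sketch of the tolerant mode, mirroring the test above:

import extruct

body = ''  # an empty document makes every extractor fail

# With errors='ignore' the failures are swallowed and whatever parsed
# successfully (here, nothing at all) is returned.
data = extruct.extract(body, errors='ignore')
print(data)  # {}
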
github stanford-oval / genie-toolkit / scripts / parse_restaurants.py
    queue = [initial]
    visited = set()

    # breadth-first crawl: pop a URL, extract its JSON-LD, then enqueue
    # any matching links found on the page
    while len(queue) > 0 and len(visited) < limit:
        next = queue.pop()
        if next in visited:
            continue

        visited.add(next)
        print(f'Calling {next}', file=sys.stderr)

        try:
            response = requests.get(next)
            base_url = get_base_url(response.text, response.url)

            data = extruct.extract(response.text, base_url=base_url, syntaxes=['json-ld'])
            if len(data['json-ld']) > 0:
                output.append(data['json-ld'][0])

            soup = BeautifulSoup(response.text, 'html5lib')

            for link in soup.find_all('a'):
                if 'href' not in link.attrs:
                    continue
                linkurl = urllib.parse.urljoin(base_url, link['href'])
                if linkurl in visited:
                    continue

                if urlpatterns:
                    for pat in urlpatterns:
                        if pat(linkurl):
                            queue.insert(0, linkurl.split('?')[0])
github scrapinghub / extruct / extruct / tool.py
def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False,
                      schema_context='http://schema.org', errors='strict'):
    resp = requests.get(url, timeout=30)
    result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        return result
    result.update(extruct.extract(resp.content,
                                  base_url=url,  # FIXME: use base url
                                  syntaxes=syntaxes,
                                  uniform=uniform,
                                  schema_context=schema_context,
                                  errors=errors))
    return result
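
A sketch of calling this helper directly; the URL is a placeholder, and it assumes extruct.tool is importable from an installed extruct:

from extruct.tool import metadata_from_url

# The helper always reports the HTTP status; extraction results are
# merged in only when the request succeeded.
result = metadata_from_url('https://example.com/', syntaxes=['json-ld'])
print(result['status'])       # e.g. '200 OK'
print(result.get('json-ld'))  # present only on success
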
github MaayanLab / FAIRshake / FAIRshakeAPI / assessments / json-ld / __init__.py
def perform(kls, inputs):
    url = inputs['target:url']
    try:
      r = requests.get(url)
      base_url = get_base_url(r.text, r.url)
      data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])['json-ld']
      tree = Tree(data)
    except Exception:
      data = None

    return dict(
      **{
        'metric:30': {
          'answer': 'yes' if data else 'no',
          'comment': 'jsonld was found and properly parsed' if data else 'jsonld could not be parsed',
        },
      },
      **{
        key: {
          'answer': 'yes' if attr else 'no',
          'comment': attr if attr else 'json-ld %s not found' % (' '.join(to_schema[key])),
        } if key.startswith('metric:') else attr
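
The guard-and-fall-back pattern above can be isolated into a small helper. A sketch, assuming any failure should simply yield None (jsonld_or_none is a made-up name):

import requests
import extruct
from w3lib.html import get_base_url

def jsonld_or_none(url):
    # Return the page's JSON-LD items, or None if the request or the
    # extraction fails for any reason.
    try:
        r = requests.get(url, timeout=30)
        base_url = get_base_url(r.text, r.url)
        return extruct.extract(r.text, base_url=base_url,
                               syntaxes=['json-ld'])['json-ld']
    except Exception:
        return None
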
github google / dspl / tools / dspl2 / dspl2 / filegetter.py
def _ProcessDspl2File(filename, fileobj, *, type=''):
  if any([filename.endswith('.html'),
          type.startswith('text/html')]):
    data = extruct.extract(fileobj.read(), uniform=True)
    return LoadGraph({
        '@context': 'http://schema.org',
        '@graph': [
            subdata_elem
            for subdata in data.values()
            for subdata_elem in subdata
            if subdata
        ]
    }, filename)
  if any([filename.endswith('.json'),
          filename.endswith('.jsonld'),
          type.startswith('application/ld+json')]):
    json_val = json.load(fileobj)
    return LoadGraph(json_val, filename)
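
The HTML branch above relies on uniform extraction to merge every syntax into one JSON-LD @graph. Stripped of the dspl2-specific LoadGraph call, the core of that merge looks roughly like this; the input filename is a placeholder:

import extruct

with open('dataset.html') as f:  # placeholder input file
    data = extruct.extract(f.read(), uniform=True)

# uniform=True yields schema.org-style dicts for every syntax, so the
# per-syntax lists can be concatenated into a single @graph.
graph = {
    '@context': 'http://schema.org',
    '@graph': [item for items in data.values() for item in items],
}
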
github micahcochran / scrape-schema-recipe / scrape_schema_recipe / scrape.py
    no results - an empty list will be returned
    """

    if not isinstance(url, str):
        raise TypeError('url is type "{}", a string was expected'
                        ''.format(type(url)))

    data = {}  # type: Dict[str, List[Dict]]
    if not user_agent_str:
        user_agent_str = USER_AGENT_STR

    r = requests.get(url, headers={'User-Agent': user_agent_str})
    r.raise_for_status()
    data = extruct.extract(r.text, r.url)
    url = r.url

    scrapings = _convert_to_scrapings(data, nonstandard_attrs, url=url)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings
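
Putting the pieces together, fetching a page and filtering its JSON-LD for recipe items follows the same shape as the snippet above; the URL and user agent string are placeholders:

import requests
import extruct
from w3lib.html import get_base_url

url = 'https://example.com/recipe'  # placeholder
r = requests.get(url, headers={'User-Agent': 'my-scraper/0.1'})
r.raise_for_status()

data = extruct.extract(r.text, base_url=get_base_url(r.text, r.url),
                       syntaxes=['json-ld'])

# Keep only items whose @type marks them as a schema.org Recipe.
recipes = [item for item in data['json-ld']
           if item.get('@type') == 'Recipe']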