How to use the refextract.references.errors.FullTextNotAvailableError exception in refextract

To help you get started, we’ve selected a few refextract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / refextract / tests / test_api.py View on Github external
def test_extract_references_from_url(pdf_files):
    """Exercise ``extract_references_from_url`` for a served PDF and a 404.

    The first PDF in ``pdf_files`` is registered as the mocked body of an
    arXiv URL, and extraction from that URL is expected to yield 36
    references.  A second URL is mocked to answer with HTTP 404, and
    extraction from it is expected to raise ``FullTextNotAvailableError``.

    NOTE(review): this relies on the ``responses`` mock being active for the
    duration of the test (e.g. via ``@responses.activate``); that setup is
    not visible in this excerpt -- confirm.
    """
    pdf_url = "http://arxiv.org/pdf/1503.07589v1.pdf"
    with open(pdf_files[0], 'rb') as fd:
        responses.add(
            responses.GET,
            pdf_url,
            body=fd.read(),
            content_type='application/pdf'
        )

    r = extract_references_from_url(pdf_url)
    assert len(r) == 36

    # Register the failing endpoint *before* entering pytest.raises so that
    # only the call under test can satisfy the expected exception.
    missing_url = "http://www.example.com"
    responses.add(
        responses.GET,
        missing_url,
        body="File not found!",
        status=404,
        content_type='text/plain',
    )
    with pytest.raises(FullTextNotAvailableError):
        extract_references_from_url(missing_url)
github inspirehep / refextract / tests / test_api.py View on Github external
def test_extract_references_from_file(pdf_files):
    """Exercise ``extract_references_from_file`` on a real and a bogus path.

    The first entry of ``pdf_files`` must yield 36 references, the first of
    which carries both a ``texkey`` and an ``author`` key.  The same path
    with ``"error"`` appended must raise ``FullTextNotAvailableError``.
    """
    source_path = pdf_files[0]
    references = extract_references_from_file(source_path)

    first_reference = references[0]
    assert 'texkey' in first_reference
    assert 'author' in first_reference
    assert len(references) == 36

    bogus_path = source_path + "error"
    with pytest.raises(FullTextNotAvailableError):
        extract_references_from_file(bogus_path)
github inspirehep / refextract / refextract / references / api.py View on Github external
E.g. you can change that by passing the reference_format:

    >>> extract_references_from_file(path, reference_format=u"{title},{volume},{page}")

    If you want to also link each reference to some other resource (like a record),
    you can provide a linker_callback function to be executed for every reference
    element found.

    To override KBs for journal names etc., use ``override_kbs_files``:

    >>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})

    """
    if not os.path.isfile(path):
        raise FullTextNotAvailableError(u"File not found: '{0}'".format(path))

    docbody = get_plaintext_document_body(path)
    reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    if not reflines:
        docbody = get_plaintext_document_body(path, keep_layout=True)
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)

    parsed_refs, stats = parse_references(
        reflines,
        recid=recid,
        reference_format=reference_format,
        linker_callback=linker_callback,
        override_kbs_files=override_kbs_files,
    )

    if magic.from_file(path, mime=True) == "application/pdf":
github inspirehep / refextract / refextract / references / api.py View on Github external
)
    os.close(filename)

    try:
        req = requests.get(
            url=url,
            headers=headers,
            stream=True
        )
        req.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in req.iter_content(chunk_size):
                f.write(chunk)
        references = extract_references_from_file(filepath, **kwargs)
    except requests.exceptions.HTTPError as exc:
        raise FullTextNotAvailableError(f"URL not found: '{url}'") from exc
    finally:
        os.remove(filepath)
    return references